Skip to content

Commit

Permalink
Merge pull request #19 from tikazyq/release
Browse files Browse the repository at this point in the history
Release
  • Loading branch information
Marvin Zhang authored Apr 15, 2019
2 parents 37f5889 + 7f7bb97 commit 746395a
Show file tree
Hide file tree
Showing 86 changed files with 19,530 additions and 786 deletions.
27 changes: 0 additions & 27 deletions LICENSE

This file was deleted.

4 changes: 4 additions & 0 deletions README-zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ npm run serve

![spider-list](./docs/img/screenshot-task-detail-results.png)

## 使用流程

![user-process](./docs/img/用户使用流程图.png)

## 架构

Crawlab的架构跟Celery非常相似,但是加入了包括前端、爬虫、Flower在内的额外模块,以支持爬虫管理的功能。
Expand Down
22 changes: 11 additions & 11 deletions crawlab/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from flask import Flask
from flask_cors import CORS
from flask_restful import Api

# from flask_restplus import Api
from utils.log import other
from constants.node import NodeStatus
from db.manager import db_manager
from routes.schedules import ScheduleApi
Expand Down Expand Up @@ -42,23 +43,22 @@
'/api/nodes',
'/api/nodes/<string:id>',
'/api/nodes/<string:id>/<string:action>')
api.add_resource(SpiderImportApi,
'/api/spiders/import/<string:platform>')
api.add_resource(SpiderManageApi,
'/api/spiders/manage/<string:action>')
api.add_resource(SpiderApi,
'/api/spiders',
'/api/spiders/<string:id>',
'/api/spiders/<string:id>/<string:action>')
api.add_resource(SpiderImportApi,
'/api/spiders/import/<string:platform>')
api.add_resource(SpiderManageApi,
'/api/spiders/manage/<string:action>')
api.add_resource(TaskApi,
'/api/tasks',
'/api/tasks/<string:id>',
'/api/tasks/<string:id>/<string:action>')
api.add_resource(DeployApi,
'/api/deploys',
'/api/deploys/<string:id>',
'/api/deploys/<string:id>/<string:action>')
api.add_resource(TaskApi,
'/api/tasks',
'/api/tasks/<string:id>',
'/api/tasks/<string:id>/<string:action>'
)
api.add_resource(FileApi,
'/api/files',
'/api/files/<string:action>')
Expand All @@ -78,7 +78,7 @@ def update_nodes_status(event):
})

def update_nodes_status_online(event):
    """
    Handle a celery worker-online event by writing it to the app log.
    :param event: celery event payload (dict-like)
    """
    # Log via the shared logger instead of print so events reach the app log.
    # (The scraped diff carried both the old print() line and this one;
    # only the logging call is kept.)
    other.info(f"{event}")

with celery_app.connection() as connection:
recv = celery_app.events.Receiver(connection, handlers={
Expand Down
4 changes: 2 additions & 2 deletions crawlab/bin/run_flower.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '..'))
sys.path.append(root_path)

from utils.log import other
from config import BROKER_URL

if __name__ == '__main__':
    # Run celery flower as a subprocess and forward its output to the app log.
    p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL],
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # Fix: the sentinel must be b'' (empty bytes), not the string 'b'.
    # p.stdout yields bytes, so a str sentinel can never match and the loop
    # would spin forever once readline() starts returning b'' at EOF.
    for line in iter(p.stdout.readline, b''):
        text = line.decode('utf-8')
        if text != '':
            other.info(text)
17 changes: 11 additions & 6 deletions crawlab/config.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
# project variables
PROJECT_SOURCE_FILE_FOLDER = '/Users/yeqing/projects/crawlab/spiders'
PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab'
PROJECT_LOGS_FOLDER = '/var/logs/crawlab'
# Path to the spider source code
PROJECT_SOURCE_FILE_FOLDER = '../spiders'
# Path to the Python interpreter of the virtual environment used to run spiders
PYTHON_ENV_PATH = '/Users/chennan/Desktop/2019/env/bin/python'
# Path where spiders are deployed
PROJECT_DEPLOY_FILE_FOLDER = '../deployfile'

PROJECT_LOGS_FOLDER = '../deployfile/logs'
PROJECT_TMP_FOLDER = '/tmp'

# celery variables
BROKER_URL = 'redis://192.168.99.100:6379/0'
CELERY_RESULT_BACKEND = 'mongodb://192.168.99.100:27017/'
BROKER_URL = 'redis://127.0.0.1:6379/0'
CELERY_RESULT_BACKEND = 'mongodb://127.0.0.1:27017/'
CELERY_MONGODB_BACKEND_SETTINGS = {
'database': 'crawlab_test',
'taskmeta_collection': 'tasks_celery',
Expand All @@ -18,7 +23,7 @@
FLOWER_API_ENDPOINT = 'http://localhost:5555/api'

# database variables
MONGO_HOST = '192.168.99.100'
MONGO_HOST = '127.0.0.1'
MONGO_PORT = 27017
MONGO_DB = 'crawlab_test'

Expand Down
89 changes: 81 additions & 8 deletions crawlab/db/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,26 @@
from mongoengine import connect
from pymongo import MongoClient, DESCENDING
from config import MONGO_HOST, MONGO_PORT, MONGO_DB
from utils import is_object_id, jsonify
from utils import is_object_id

connect(db=MONGO_DB, host=MONGO_HOST, port=MONGO_PORT)


class DbManager(object):
__doc__ = """
Database Manager class for handling database CRUD actions.
"""

def __init__(self):
self.mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
self.db = self.mongo[MONGO_DB]

def save(self, col_name: str, item, **kwargs):
def save(self, col_name: str, item: dict, **kwargs) -> None:
"""
Save the item in the specified collection
:param col_name: collection name
:param item: item object
"""
col = self.db[col_name]

# in case some fields cannot be saved in MongoDB
Expand All @@ -21,15 +30,32 @@ def save(self, col_name: str, item, **kwargs):

col.save(item, **kwargs)

def remove(self, col_name: str, cond: dict, **kwargs) -> None:
    """
    Remove all items matching the given condition.
    :param col_name: collection name
    :param cond: condition or filter
    """
    # NOTE(review): Collection.remove is deprecated in newer pymongo
    # (delete_many is the replacement); kept since the pinned pymongo
    # (3.7.2 in requirements.txt) still supports it.
    col = self.db[col_name]
    col.remove(cond, **kwargs)

def update(self, col_name: str, cond: dict, values: dict, **kwargs):
    """
    Apply a $set update to items matching the given condition.
    :param col_name: collection name
    :param cond: condition or filter
    :param values: field values to set on matching items
    """
    update_spec = {'$set': values}
    self.db[col_name].update(cond, update_spec, **kwargs)

def update_one(self, col_name: str, id: str, values: dict, **kwargs):
"""
Update an item given specified _id
:param col_name: collection name
:param id: _id
:param values: values to update
"""
col = self.db[col_name]
_id = id
if is_object_id(id):
Expand All @@ -38,14 +64,28 @@ def update_one(self, col_name: str, id: str, values: dict, **kwargs):
col.find_one_and_update({'_id': _id}, {'$set': values})

def remove_one(self, col_name: str, id: str, **kwargs) -> None:
    """
    Remove a single item by its _id.
    :param col_name: collection name
    :param id: _id value; ObjectId-shaped hex strings are converted to ObjectId
    """
    col = self.db[col_name]
    _id = ObjectId(id) if is_object_id(id) else id
    # Fix: forward **kwargs to the driver call; they were accepted by the
    # signature but silently dropped before (inconsistent with remove()).
    col.remove({'_id': _id}, **kwargs)

def list(self, col_name: str, cond: dict, sort_key=None, sort_direction=DESCENDING, skip: int = 0, limit: int = 100,
**kwargs):
**kwargs) -> list:
"""
Return a list of items given specified condition, sort_key, sort_direction, skip, and limit.
:param col_name: collection name
:param cond: condition or filter
:param sort_key: key to sort
:param sort_direction: sort direction
:param skip: skip number
:param limit: limit number
"""
if sort_key is None:
sort_key = '_i'
col = self.db[col_name]
Expand All @@ -54,11 +94,21 @@ def list(self, col_name: str, cond: dict, sort_key=None, sort_direction=DESCENDI
data.append(item)
return data

def _get(self, col_name: str, cond: dict) -> dict:
    """
    Get a single item matching the given condition (None when no match,
    per pymongo's find_one).
    :param col_name: collection name
    :param cond: condition or filter
    """
    col = self.db[col_name]
    return col.find_one(cond)

def get(self, col_name: str, id):
def get(self, col_name: str, id: (ObjectId, str)) -> dict:
"""
Get an item given specified _id.
:param col_name: collection name
:param id: _id
"""
if type(id) == ObjectId:
_id = id
elif is_object_id(id):
Expand All @@ -67,28 +117,51 @@ def get(self, col_name: str, id):
_id = id
return self._get(col_name=col_name, cond={'_id': _id})

def get_one_by_key(self, col_name: str, key, value) -> dict:
    """
    Get a single item where the given field equals the given value.
    :param col_name: collection name
    :param key: field name
    :param value: field value to match
    """
    return self._get(col_name=col_name, cond={key: value})

def count(self, col_name: str, cond) -> int:
    """
    Count the documents in a collection matching the given condition.
    :param col_name: collection name
    :param cond: condition or filter
    """
    # NOTE(review): Collection.count is deprecated in newer pymongo;
    # count_documents(cond) is the modern equivalent.
    col = self.db[col_name]
    return col.count(cond)

def get_latest_version(self, spider_id, node_id):
    """
    Return the highest deploy version recorded for a spider/node pair,
    or None when no deploy exists.
    @deprecated
    """
    # find_one with a sort spec returns the first document of the sorted
    # result, or None — equivalent to iterating and returning the first hit.
    latest = self.db['deploys'].find_one(
        {'spider_id': ObjectId(spider_id), 'node_id': node_id},
        sort=[('version', DESCENDING)],
    )
    return latest.get('version') if latest is not None else None

def get_last_deploy(self, spider_id):
    """
    Return the most recently finished deploy document for a spider,
    or None when no deploy exists.
    @deprecated
    """
    # find_one with a sort spec yields the first sorted document or None,
    # matching the original loop-and-return-first behaviour.
    return self.db['deploys'].find_one(
        {'spider_id': ObjectId(spider_id)},
        sort=[('finish_ts', DESCENDING)],
    )

def aggregate(self, col_name: str, pipelines, **kwargs):
    """
    Run a MongoDB aggregation pipeline on the given collection.
    Reference: https://docs.mongodb.com/manual/reference/command/aggregate/
    :param col_name: collection name
    :param pipelines: aggregation pipeline stages
    """
    return self.db[col_name].aggregate(pipelines, **kwargs)

Expand Down
4 changes: 2 additions & 2 deletions crawlab/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from routes.stats import StatsApi
from routes.tasks import TaskApi
from tasks.celery import celery_app

from utils.log import other
# flask app instance
app = Flask(__name__)
app.config.from_object('config')
Expand Down Expand Up @@ -81,7 +81,7 @@ def run_flower():
p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for line in iter(p.stdout.readline, 'b'):
if line.decode('utf-8') != '':
print(line.decode('utf-8'))
other.info(line.decode('utf-8'))


def run_worker():
Expand Down
35 changes: 35 additions & 0 deletions crawlab/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
amqp==2.4.2
aniso8601==6.0.0
APScheduler==3.6.0
attrs==19.1.0
Babel==2.6.0
billiard==3.6.0.0
celery==4.3.0
certifi==2019.3.9
chardet==3.0.4
Click==7.0
coloredlogs==10.0
Flask==1.0.2
Flask-Cors==3.0.7
Flask-RESTful==0.3.7
flask-restplus==0.12.1
flower==0.9.3
humanfriendly==4.18
idna==2.8
itsdangerous==1.1.0
Jinja2==2.10
jsonschema==3.0.1
kombu==4.5.0
MarkupSafe==1.1.1
mongoengine==0.17.0
pymongo==3.7.2
pyrsistent==0.14.11
pytz==2018.9
redis==3.2.1
requests==2.21.0
six==1.12.0
tornado==5.1.1
tzlocal==1.5.1
urllib3==1.24.1
vine==1.3.0
Werkzeug==0.15.2
Loading

0 comments on commit 746395a

Please sign in to comment.