Merge pull request #71 from tikazyq/develop
Develop
Marvin Zhang authored Jun 21, 2019
2 parents 35291fc + 89f3a87 commit f14b896
Showing 6 changed files with 68 additions and 88 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,12 @@
# 0.2.4 (unreleased)
### Features / Enhancement
- **Documentation**: Better and much more detailed documentation.
- **Better Crontab**: Make crontab expression through crontab UI.
- **High Concurrency**: `gevent` + `flask` to support high concurrency. [#70](https://github.com/tikazyq/crawlab/issues/70)

### Bugs Fixes
- **MongoDB Auth**. Allow user to specify `authenticationDatabase` to connect to `mongodb`. [#68](https://github.com/tikazyq/crawlab/issues/68)
- **Windows Compatibility**. Added `eventlet` to `requirements.txt`. [#59](https://github.com/tikazyq/crawlab/issues/59)


# 0.2.3 (2019-06-12)
@@ -10,7 +16,7 @@
- **Upload Spider**: Allow user to upload Customized Spider to Crawlab.
- **Edit Fields on Preview**: Allow user to edit fields when previewing data in Configurable Spider.

### Bugs ###
### Bugs Fixes
- **Spiders Pagination**. Fixed pagination problem in spider page.

# 0.2.2 (2019-05-30)
16 changes: 9 additions & 7 deletions crawlab/app.py
@@ -6,19 +6,20 @@
from flask_cors import CORS
from flask_restful import Api
# from flask_restplus import Api
from routes.sites import SiteApi
from gevent import monkey, pywsgi

file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '.'))
sys.path.append(root_path)

from utils.log import other
from constants.node import NodeStatus
from db.manager import db_manager
from routes.schedules import ScheduleApi
from tasks.celery import celery_app
from tasks.scheduler import scheduler

file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '.'))
sys.path.append(root_path)

from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER
from routes.sites import SiteApi
from routes.deploys import DeployApi
from routes.files import FileApi
from routes.nodes import NodeApi
@@ -103,4 +104,5 @@ def update_nodes_status_online(event):

if __name__ == '__main__':
# run app instance
app.run(host=FLASK_HOST, port=FLASK_PORT, threaded=False, processes=4)
server = pywsgi.WSGIServer((FLASK_HOST, FLASK_PORT), app)
server.serve_forever()
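
The hunk above replaces Flask's built-in development server (`app.run(..., threaded=False, processes=4)`) with gevent's `pywsgi.WSGIServer`, which is what the "High Concurrency" changelog entry refers to. Below is a minimal, self-contained sketch of that serving pattern. The host, port, and route are placeholders, and the `monkey.patch_all()` call is an assumption inferred from the `from gevent import monkey, pywsgi` import, since the patch call itself is not visible in this hunk.

```python
# Minimal sketch: serving a Flask app with gevent's pywsgi server.
# monkey.patch_all() is assumed from the `monkey` import in the diff above.
from gevent import monkey
monkey.patch_all()  # patch blocking stdlib I/O so greenlets can yield cooperatively

from flask import Flask
from gevent import pywsgi

app = Flask(__name__)

@app.route('/ping')
def ping():
    return 'pong'

if __name__ == '__main__':
    # One process handling many concurrent requests via greenlets,
    # instead of Flask's threaded=False, processes=4.
    server = pywsgi.WSGIServer(('0.0.0.0', 8000), app)
    server.serve_forever()
```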
1 change: 1 addition & 0 deletions crawlab/config/config.py
@@ -20,6 +20,7 @@
MONGO_USERNAME = None
MONGO_PASSWORD = None
MONGO_DB = 'crawlab_test'
MONGO_AUTH_DB = 'crawlab_test'

# Celery broker URL
BROKER_URL = 'redis://127.0.0.1:6379/0'
6 changes: 2 additions & 4 deletions crawlab/db/manager.py
@@ -1,11 +1,8 @@
from bson import ObjectId
from mongoengine import connect
from pymongo import MongoClient, DESCENDING
from config import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_USERNAME, MONGO_PASSWORD
from config import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_USERNAME, MONGO_PASSWORD, MONGO_AUTH_DB
from utils import is_object_id

connect(db=MONGO_DB, host=MONGO_HOST, port=MONGO_PORT)


class DbManager(object):
__doc__ = """
@@ -17,6 +14,7 @@ def __init__(self):
port=MONGO_PORT,
username=MONGO_USERNAME,
password=MONGO_PASSWORD,
authSource=MONGO_AUTH_DB or MONGO_DB,
connect=False)
self.db = self.mongo[MONGO_DB]

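
The `authSource=MONGO_AUTH_DB or MONGO_DB` argument added here is what lets users specify an `authenticationDatabase` separate from the data database (issue #68). A hedged sketch of the same connection pattern with plain `pymongo` follows; the host, port, and credential values are placeholders, not the project's actual settings.

```python
# Sketch: connecting to MongoDB with a separate authentication database.
# All values below are placeholders for illustration.
from pymongo import MongoClient

MONGO_HOST = '127.0.0.1'
MONGO_PORT = 27017
MONGO_DB = 'crawlab_test'
MONGO_USERNAME = 'crawlab'
MONGO_PASSWORD = 'secret'
MONGO_AUTH_DB = 'admin'  # the database that holds the user's credentials

client = MongoClient(
    host=MONGO_HOST,
    port=MONGO_PORT,
    username=MONGO_USERNAME,
    password=MONGO_PASSWORD,
    authSource=MONGO_AUTH_DB or MONGO_DB,  # fall back to the data db if unset
    connect=False,                         # defer connecting until first operation
)
db = client[MONGO_DB]
```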
83 changes: 8 additions & 75 deletions crawlab/requirements.txt
@@ -1,80 +1,13 @@
aiohttp==3.5.4
amqp==2.4.2
aniso8601==6.0.0
Flask_CSV==1.2.0
gevent==1.4.0
requests==2.21.0
Scrapy==1.6.0
pymongo==3.7.2
APScheduler==3.6.0
asn1crypto==0.24.0
async-timeout==3.0.1
attrs==19.1.0
Automat==0.7.0
Babel==2.6.0
beautifulsoup4==4.7.1
billiard==3.6.0.0
bs4==0.0.1
bson==0.5.8
cachetools==3.1.0
celery==4.3.0
certifi==2019.3.9
cffi==1.12.3
chardet==3.0.4
Click==7.0
coloredlogs==10.0
constantly==15.1.0
cryptography==2.6.1
cssselect==1.0.3
csvalidate==1.1.1
eventlet==0.25.0
Flask_RESTful==0.3.7
Flask==1.0.2
Flask-APScheduler==1.11.0
Flask-Cors==3.0.7
Flask-CSV==1.2.0
Flask-RESTful==0.3.7
flask-restplus==0.12.1
flower==0.9.3
gevent==1.4.0
greenlet==0.4.15
gunicorn==19.9.0
html5lib==1.0.1
humanfriendly==4.18
hyperlink==19.0.0
idna==2.8
idna-ssl==1.1.0
incremental==17.5.0
itsdangerous==1.1.0
Jinja2==2.10
jsonpickle==1.1
jsonschema==3.0.1
kombu==4.5.0
lxml==4.3.3
MarkupSafe==1.1.1
marshmallow==2.19.2
mongoengine==0.17.0
multidict==4.5.2
parsel==1.5.1
pyasn1==0.4.5
pyasn1-modules==0.2.5
pycparser==2.19
PyDispatcher==2.0.5
PyHamcrest==1.9.0
pymongo==3.7.2
pyOpenSSL==19.0.0
pyrsistent==0.14.11
python-dateutil==2.8.0
pytz==2018.9
queuelib==1.5.0
redis==3.2.1
requests==2.21.0
Scrapy==1.6.0
service-identity==18.1.0
six==1.12.0
soupsieve==1.9.1
tornado==5.1.1
Twisted==19.2.0
typing-extensions==3.7.2
tzlocal==1.5.1
urllib3==1.24.1
vine==1.3.0
w3lib==1.20.0
webencodings==0.5.1
Flask_Cors==3.0.7
Werkzeug==0.15.2
yarl==1.3.0
zope.interface==4.6.0
eventlet
42 changes: 41 additions & 1 deletion crawlab/routes/spiders.py
@@ -153,13 +153,14 @@ def get(self, id=None, action=None):
if spider is None:
stats = get_file_suffix_stats(dir_path)
lang = get_lang_by_stats(stats)
spider = db_manager.save('spiders', {
spider_id = db_manager.save('spiders', {
'name': dir_name,
'src': dir_path,
'lang': lang,
'suffix_stats': stats,
'type': SpiderType.CUSTOMIZED
})
spider = db_manager.get('spiders', id=spider_id)

# existing spider
else:
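
The change from `spider = db_manager.save(...)` to `spider_id = db_manager.save(...)` plus the follow-up `db_manager.get(...)` suggests that `save` returns the inserted document's id rather than the document itself. The illustration below mimics that split with raw `pymongo`; `db_manager`'s actual implementation is not shown in this diff, so the `save`/`get` helpers here are assumptions.

```python
# Illustration (assumption): save() returns an inserted _id, so the stored
# document is re-fetched, mirroring the spider_id / spider split above.
from pymongo import MongoClient

client = MongoClient(connect=False)
col = client['crawlab_test']['spiders']

def save(doc: dict):
    """Insert a document and return its _id (analogous to db_manager.save)."""
    return col.insert_one(doc).inserted_id

def get(_id):
    """Fetch a document by _id (analogous to db_manager.get)."""
    return col.find_one({'_id': _id})

spider_id = save({'name': 'example_spider', 'type': 'customized'})
spider = get(spider_id)  # work with the full stored document, not just its id
```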
@@ -214,11 +215,50 @@ def get(self, id=None, action=None):
items[i]['last_5_errors'] = get_last_n_run_errors_count(spider_id=spider['_id'], n=5)
items[i]['last_7d_tasks'] = get_last_n_day_tasks_count(spider_id=spider['_id'], n=5)

# sort spiders by _id descending
items = reversed(sorted(items, key=lambda x: x['_id']))

return {
'status': 'ok',
'items': jsonify(items)
}

def delete(self, id: str = None) -> (dict, tuple):
"""
DELETE method of given id for deleting a spider.
:param id:
:return:
"""
# get spider from db
spider = db_manager.get(col_name=self.col_name, id=id)

# delete spider folder
if spider.get('type') == SpiderType.CUSTOMIZED:
try:
shutil.rmtree(os.path.abspath(os.path.join(PROJECT_SOURCE_FILE_FOLDER, spider['src'])))
except Exception as err:
return {
'status': 'ok',
'error': str(err)
}, 500

# perform delete action
db_manager.remove_one(col_name=self.col_name, id=id)

# remove related tasks
db_manager.remove(col_name='tasks', cond={'spider_id': spider['_id']})

# remove related schedules
db_manager.remove(col_name='schedules', cond={'spider_id': spider['_id']})

# execute after_update hook
self.after_update(id)

return {
'status': 'ok',
'message': 'deleted successfully',
}

def crawl(self, id: str) -> (dict, tuple):
"""
Submit an HTTP request to start a crawl task in the node of given spider_id.
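
The new `delete` handler removes the spider's source folder, then the spider record, and finally the tasks and schedules that reference it. A stripped-down sketch of that cleanup order with `shutil` and `pymongo` is below; the collection names and the overall flow follow the diff, but the exact `db_manager` API and the `source_root` path are assumptions.

```python
# Sketch of the cascade delete in SpiderApi.delete: remove source files first,
# then the spider document, then the tasks and schedules that point at it.
import os
import shutil
from bson import ObjectId
from pymongo import MongoClient

client = MongoClient(connect=False)
db = client['crawlab_test']

def delete_spider(spider_id: str, source_root: str = './spiders'):
    spider = db['spiders'].find_one({'_id': ObjectId(spider_id)})
    if spider is None:
        return {'status': 'ok', 'error': 'spider not found'}, 404

    # 1. remove the uploaded source folder for customized spiders
    if spider.get('type') == 'customized':
        shutil.rmtree(os.path.join(source_root, spider['src']), ignore_errors=True)

    # 2. remove the spider document itself
    db['spiders'].delete_one({'_id': spider['_id']})

    # 3. remove dependent tasks and schedules
    db['tasks'].delete_many({'spider_id': spider['_id']})
    db['schedules'].delete_many({'spider_id': spider['_id']})

    return {'status': 'ok', 'message': 'deleted successfully'}, 200
```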
