diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..056d24d8d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,28 @@ +# 0.2.2 (unreleased) +### Features / Enhancements +- **Automatic Extract Fields**: Automatically extract data fields from list pages for configurable spiders. +- **Download Results**: Allow downloading results as a CSV file. +- **Baidu Tongji**: Allow users to choose whether to report usage info to Baidu Tongji. + +### Bug Fixes +- **Results Page Pagination**: Fixed pagination on the results page so that it works correctly. [#45](https://github.com/tikazyq/crawlab/issues/45) +- **Schedule Tasks Duplicated Triggers**: Set Flask DEBUG to False so that scheduled tasks won't trigger twice. [#32](https://github.com/tikazyq/crawlab/issues/32) +- **Frontend Environment**: Added `VUE_APP_BASE_URL` as a production-mode environment variable so that API calls won't always go to `localhost` in a deployed environment. [#30](https://github.com/tikazyq/crawlab/issues/30) + +# 0.2.1 (2019-05-27) +- **Configurable Spider**: Allow users to create a spider to crawl data without coding. + +# 0.2 (2019-05-10) + +- **Advanced Stats**: Advanced analytics in the spider detail view. +- **Sites Data**: Added a sites list (China) for users to check info such as robots.txt and home page response time/code. + +# 0.1.1 (2019-04-23) + +- **Basic Stats**: Users can view basic stats such as the number of failed tasks and the number of results in the spiders and tasks pages. +- **Near Realtime Task Info**: Periodically (every 5 sec) poll data from the server to allow viewing task info in near real time. +- **Scheduled Tasks**: Allow users to set up cron-like scheduled/periodic tasks using apscheduler. 
+ +# 0.1 (2019-04-17) + +- **Initial Release** diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt index 0a1b7f137..e9559317d 100644 --- a/crawlab/requirements.txt +++ b/crawlab/requirements.txt @@ -10,6 +10,7 @@ Babel==2.6.0 beautifulsoup4==4.7.1 billiard==3.6.0.0 bs4==0.0.1 +bson==0.5.8 cachetools==3.1.0 celery==4.3.0 certifi==2019.3.9 @@ -20,9 +21,11 @@ coloredlogs==10.0 constantly==15.1.0 cryptography==2.6.1 cssselect==1.0.3 +csvalidate==1.1.1 Flask==1.0.2 Flask-APScheduler==1.11.0 Flask-Cors==3.0.7 +Flask-CSV==1.2.0 Flask-RESTful==0.3.7 flask-restplus==0.12.1 flower==0.9.3 @@ -42,6 +45,7 @@ jsonschema==3.0.1 kombu==4.5.0 lxml==4.3.3 MarkupSafe==1.1.1 +marshmallow==2.19.2 mongoengine==0.17.0 multidict==4.5.2 parsel==1.5.1 diff --git a/crawlab/routes/tasks.py b/crawlab/routes/tasks.py index 5ae7648bf..a318803bc 100644 --- a/crawlab/routes/tasks.py +++ b/crawlab/routes/tasks.py @@ -1,6 +1,9 @@ import json import os import sys +from time import time + +from flask_csv import send_csv try: from _signal import SIGKILL @@ -178,7 +181,7 @@ def get_results(self, id: str) -> (dict, tuple): if not col_name: return [] fields = get_spider_col_fields(col_name) - items = db_manager.list(col_name, {'task_id': id}) + items = db_manager.list(col_name, {'task_id': id}, skip=page_size * (page_num - 1), limit=page_size) return { 'status': 'ok', 'fields': jsonify(fields), @@ -213,3 +216,16 @@ def stop(self, id): 'id': id, 'status': 'ok', } + + def download_results(self, id: str): + task = db_manager.get('tasks', id=id) + spider = db_manager.get('spiders', id=task['spider_id']) + col_name = spider.get('col') + if not col_name: + return send_csv([], f'results_{col_name}_{round(time())}.csv') + items = db_manager.list(col_name, {'task_id': id}, limit=999999999) + fields = get_spider_col_fields(col_name, task_id=id, limit=999999999) + return send_csv(items, + filename=f'results_{col_name}_{round(time())}.csv', + fields=fields, + encoding='utf-8') diff --git 
a/crawlab/utils/spider.py b/crawlab/utils/spider.py index 9a2b48df3..d89950285 100644 --- a/crawlab/utils/spider.py +++ b/crawlab/utils/spider.py @@ -41,12 +41,17 @@ def get_spider_type(path: str) -> SpiderType: return SpiderType.SCRAPY -def get_spider_col_fields(col_name: str) -> list: +def get_spider_col_fields(col_name: str, task_id: str = None, limit: int = 100) -> list: """ Get spider collection fields :param col_name: collection name + :param task_id: task_id + :param limit: limit """ - items = db_manager.list(col_name, {}, limit=100, sort_key='_id') + filter_ = {} + if task_id is not None: + filter_['task_id'] = task_id + items = db_manager.list(col_name, filter_, limit=limit, sort_key='_id') fields = set() for item in items: for k in item.keys(): diff --git a/frontend/src/components/TableView/GeneralTableView.vue b/frontend/src/components/TableView/GeneralTableView.vue index 97bd3b381..25cd1923c 100644 --- a/frontend/src/components/TableView/GeneralTableView.vue +++ b/frontend/src/components/TableView/GeneralTableView.vue @@ -58,18 +58,18 @@ export default { computed: { filteredData () { return this.data - .map(d => d) - .filter((d, index) => { - // pagination - const pageNum = this.pageNum - const pageSize = this.pageSize - return (pageSize * (pageNum - 1) <= index) && (index < pageSize * pageNum) - }) + // .map(d => d) + // .filter((d, index) => { + // // pagination + // const pageNum = this.pageNum + // const pageSize = this.pageSize + // return (pageSize * (pageNum - 1) <= index) && (index < pageSize * pageNum) + // }) } }, methods: { onPageChange () { - this.$emit('page-change') + this.$emit('page-change', { pageNum: this.pageNum, pageSize: this.pageSize }) } } } diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js index 35d7eaf89..18813c3a0 100644 --- a/frontend/src/i18n/zh.js +++ b/frontend/src/i18n/zh.js @@ -56,6 +56,8 @@ export default { 'Stop': '停止', 'Preview': '预览', 'Extract Fields': '提取字段', + 'Download': '下载', + 'Download CSV': 
'下载CSV', // 主页 'Total Tasks': '总任务数', diff --git a/frontend/src/store/modules/task.js b/frontend/src/store/modules/task.js index 02a238411..906f9f699 100644 --- a/frontend/src/store/modules/task.js +++ b/frontend/src/store/modules/task.js @@ -18,7 +18,7 @@ const state = { pageNum: 0, pageSize: 10, // results - resultsPageNum: 0, + resultsPageNum: 1, resultsPageSize: 10 } diff --git a/frontend/src/views/task/TaskDetail.vue b/frontend/src/views/task/TaskDetail.vue index 601ba5004..cf3bff21b 100644 --- a/frontend/src/views/task/TaskDetail.vue +++ b/frontend/src/views/task/TaskDetail.vue @@ -15,11 +15,17 @@ +
+ + {{$t('Download CSV')}} + +
+ :total="taskResultsTotalCount" + @page-change="onResultsPageChange"/>
@@ -78,6 +84,15 @@ export default { }, onSpiderChange (id) { this.$router.push(`/spiders/${id}`) + }, + onResultsPageChange (payload) { + const { pageNum, pageSize } = payload + this.resultsPageNum = pageNum + this.resultsPageSize = pageSize + this.$store.dispatch('task/getTaskResults', this.$route.params.id) + }, + downloadCSV () { + window.location.href = this.$request.baseUrl + '/tasks/' + this.$route.params.id + '/download_results' } }, created () { @@ -114,4 +129,9 @@ export default { overflow-x: auto; overflow-y: auto; } + + .button-group { + margin-bottom: 10px; + text-align: right; + }