Merge pull request #46 from tikazyq/develop
Develop
Marvin Zhang authored May 30, 2019
2 parents 93e3752 + fb11a14 commit 3981317
Showing 8 changed files with 88 additions and 13 deletions.
28 changes: 28 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,28 @@
# 0.2.2 (unreleased)
### Features / Enhancement
- **Automatic Extract Fields**: Automatically extract data fields from list pages for the configurable spider (a rough sketch of the general idea follows this list).
- **Download Results**: Allow downloading results as a CSV file.
- **Baidu Tongji**: Allow users to choose whether to report usage info to Baidu Tongji.
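
The field extraction feature is only described at a high level here. Below is a rough, hedged sketch of one common heuristic for auto-detecting fields in a list page — pick the container with the most direct children and treat the text-bearing tags inside one item as candidate fields. It is an illustration only, not Crawlab's actual extraction code; the sample HTML and names are made up.

```python
# Hedged illustration of list-page field auto-detection (not Crawlab's code).
from bs4 import BeautifulSoup

HTML = """
<ul>
  <li><span class="title">Item A</span><span class="price">10</span></li>
  <li><span class="title">Item B</span><span class="price">12</span></li>
  <li><span class="title">Item C</span><span class="price">14</span></li>
</ul>
"""


def guess_list_fields(html):
    soup = BeautifulSoup(html, 'lxml')
    # Heuristic: the list container is the tag with the most direct children.
    container = max(soup.find_all(True),
                    key=lambda tag: len(tag.find_all(recursive=False)))
    sample_item = container.find_all(recursive=False)[0]
    # Each text-bearing tag inside one item becomes a candidate field.
    return [{'tag': el.name, 'class': el.get('class'),
             'sample': el.get_text(strip=True)}
            for el in sample_item.find_all(True) if el.get_text(strip=True)]


print(guess_list_fields(HTML))
```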

### Bug Fixes
- **Results Page Pagination**: Fixed pagination on the results page so it works correctly. [#45](https://github.com/tikazyq/crawlab/issues/45)
- **Scheduled Tasks Duplicated Triggers**: Set Flask `DEBUG` to `False` so scheduled tasks are no longer triggered twice (a minimal sketch of the underlying issue follows this list). [#32](https://github.com/tikazyq/crawlab/issues/32)
- **Frontend Environment**: Added `VUE_APP_BASE_URL` as a production-mode environment variable so API calls no longer default to `localhost` in deployed environments. [#30](https://github.com/tikazyq/crawlab/issues/30)
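
Why the double trigger happens: Flask's debug reloader runs the application in two processes, so a scheduler started at import time registers every job twice. The sketch below illustrates this with Flask-APScheduler under that assumption; the job name and interval are illustrative, not Crawlab's actual tasks.

```python
# Hedged sketch: disabling debug/reloader keeps a single scheduler process.
from flask import Flask
from flask_apscheduler import APScheduler

app = Flask(__name__)
scheduler = APScheduler()
scheduler.init_app(app)


def demo_job():  # illustrative job, not one of Crawlab's scheduled tasks
    print('running scheduled task')


scheduler.add_job(id='demo_job', func=demo_job, trigger='interval', seconds=60)
scheduler.start()

if __name__ == '__main__':
    # With debug=True the reloader spawns a second process and the job fires
    # twice per interval; DEBUG=False (or use_reloader=False) avoids that.
    app.run(debug=False)
```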

# 0.2.1 (2019-05-27)
- **Configurable Spider**: Allow users to create a spider to crawl data without coding.

# 0.2 (2019-05-10)

- **Advanced Stats**: Advanced analytics in the spider detail view.
- **Sites Data**: Added a sites list (China) so users can check info such as robots.txt and home-page response time/status code.

# 0.1.1 (2019-04-23)

- **Basic Stats**: Users can view basic stats, such as the number of failed tasks and the number of results, on the spider and task pages.
- **Near-Realtime Task Info**: Poll data from the server periodically (every 5 seconds) so task info can be viewed in near real time.
- **Scheduled Tasks**: Allow users to set up cron-like scheduled/periodic tasks using APScheduler.

# 0.1 (2019-04-17)

- **Initial Release**
4 changes: 4 additions & 0 deletions crawlab/requirements.txt
@@ -10,6 +10,7 @@ Babel==2.6.0
beautifulsoup4==4.7.1
billiard==3.6.0.0
bs4==0.0.1
bson==0.5.8
cachetools==3.1.0
celery==4.3.0
certifi==2019.3.9
@@ -20,9 +21,11 @@ coloredlogs==10.0
constantly==15.1.0
cryptography==2.6.1
cssselect==1.0.3
csvalidate==1.1.1
Flask==1.0.2
Flask-APScheduler==1.11.0
Flask-Cors==3.0.7
Flask-CSV==1.2.0
Flask-RESTful==0.3.7
flask-restplus==0.12.1
flower==0.9.3
@@ -42,6 +45,7 @@ jsonschema==3.0.1
kombu==4.5.0
lxml==4.3.3
MarkupSafe==1.1.1
marshmallow==2.19.2
mongoengine==0.17.0
multidict==4.5.2
parsel==1.5.1
18 changes: 17 additions & 1 deletion crawlab/routes/tasks.py
@@ -1,6 +1,9 @@
import json
import os
import sys
from time import time

from flask_csv import send_csv

try:
from _signal import SIGKILL
@@ -178,7 +181,7 @@ def get_results(self, id: str) -> (dict, tuple):
if not col_name:
return []
fields = get_spider_col_fields(col_name)
items = db_manager.list(col_name, {'task_id': id})
items = db_manager.list(col_name, {'task_id': id}, skip=page_size * (page_num - 1), limit=page_size)
return {
'status': 'ok',
'fields': jsonify(fields),
@@ -213,3 +216,16 @@ def stop(self, id):
'id': id,
'status': 'ok',
}

def download_results(self, id: str):
task = db_manager.get('tasks', id=id)
spider = db_manager.get('spiders', id=task['spider_id'])
col_name = spider.get('col')
if not col_name:
return send_csv([], f'results_{col_name}_{round(time())}.csv')
items = db_manager.list(col_name, {'task_id': id}, limit=999999999)
fields = get_spider_col_fields(col_name, task_id=id, limit=999999999)
return send_csv(items,
filename=f'results_{col_name}_{round(time())}.csv',
fields=fields,
encoding='utf-8')
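
Taken together, the two hunks above move results pagination into the database query (via skip/limit) and add a CSV export endpoint built on flask_csv. Below is a minimal, self-contained sketch of the same two ideas using pymongo directly; the collection name, route shapes, and query-parameter names are illustrative assumptions, not Crawlab's exact implementation.

```python
# Hedged sketch: server-side pagination plus CSV export.
# 'crawlab_test'/'results_demo' and the query parameters are assumptions.
from time import time

from flask import Flask, jsonify, request
from flask_csv import send_csv
from pymongo import MongoClient

app = Flask(__name__)
col = MongoClient()['crawlab_test']['results_demo']  # assumed collection


@app.route('/tasks/<id>/results')
def get_results(id):
    page_num = int(request.args.get('page_num', 1))
    page_size = int(request.args.get('page_size', 10))
    cursor = (col.find({'task_id': id})
                 .skip(page_size * (page_num - 1))  # same skip formula as above
                 .limit(page_size))
    items = [{k: v for k, v in doc.items() if k != '_id'} for doc in cursor]
    return jsonify(status='ok', items=items)


@app.route('/tasks/<id>/download_results')
def download_results(id):
    docs = [{k: v for k, v in doc.items() if k != '_id'}
            for doc in col.find({'task_id': id})]
    # Union of keys across the task's documents stands in for get_spider_col_fields.
    fields = sorted({k for doc in docs for k in doc})
    return send_csv(docs, filename=f'results_{round(time())}.csv',
                    fields=fields, encoding='utf-8')
```

With endpoints shaped like this, the frontend's page-change event only needs to re-request the results with the new page number and size, which matches the `@page-change` wiring added in TaskDetail.vue further down.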
9 changes: 7 additions & 2 deletions crawlab/utils/spider.py
@@ -41,12 +41,17 @@ def get_spider_type(path: str) -> SpiderType:
return SpiderType.SCRAPY


def get_spider_col_fields(col_name: str) -> list:
def get_spider_col_fields(col_name: str, task_id: str = None, limit: int = 100) -> list:
"""
Get spider collection fields
:param col_name: collection name
:param task_id: task_id
:param limit: limit
"""
items = db_manager.list(col_name, {}, limit=100, sort_key='_id')
filter_ = {}
if task_id is not None:
filter_['task_id'] = task_id
items = db_manager.list(col_name, filter_, limit=limit, sort_key='_id')
fields = set()
for item in items:
for k in item.keys():
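
The visible hunk is cut off mid-loop, but the intent is to collect the union of field names from a sample of documents, optionally restricted to a single task so the CSV export's header matches that task's results. A minimal stand-alone sketch of the idea, using pymongo in place of the project's db_manager (collection and task id are illustrative):

```python
# Hedged sketch of the field-collection idea behind get_spider_col_fields.
from pymongo import MongoClient


def get_col_fields(col, task_id=None, limit=100):
    """Return the sorted union of field names seen in up to `limit` documents."""
    filter_ = {} if task_id is None else {'task_id': task_id}
    fields = set()
    for item in col.find(filter_, limit=limit, sort=[('_id', 1)]):
        fields.update(k for k in item if k != '_id')
    return sorted(fields)


if __name__ == '__main__':
    col = MongoClient()['crawlab_test']['results_demo']  # assumed collection
    print(get_col_fields(col, task_id='some-task-id'))   # hypothetical task id
```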
16 changes: 8 additions & 8 deletions frontend/src/components/TableView/GeneralTableView.vue
@@ -58,18 +58,18 @@ export default {
computed: {
filteredData () {
return this.data
.map(d => d)
.filter((d, index) => {
// pagination
const pageNum = this.pageNum
const pageSize = this.pageSize
return (pageSize * (pageNum - 1) <= index) && (index < pageSize * pageNum)
})
// .map(d => d)
// .filter((d, index) => {
// // pagination
// const pageNum = this.pageNum
// const pageSize = this.pageSize
// return (pageSize * (pageNum - 1) <= index) && (index < pageSize * pageNum)
// })
}
},
methods: {
onPageChange () {
this.$emit('page-change')
this.$emit('page-change', { pageNum: this.pageNum, pageSize: this.pageSize })
}
}
}
2 changes: 2 additions & 0 deletions frontend/src/i18n/zh.js
@@ -56,6 +56,8 @@ export default {
'Stop': '停止',
'Preview': '预览',
'Extract Fields': '提取字段',
'Download': '下载',
'Download CSV': '下载CSV',

// 主页
'Total Tasks': '总任务数',
2 changes: 1 addition & 1 deletion frontend/src/store/modules/task.js
@@ -18,7 +18,7 @@ const state = {
pageNum: 0,
pageSize: 10,
// results
resultsPageNum: 0,
resultsPageNum: 1,
resultsPageSize: 10
}

22 changes: 21 additions & 1 deletion frontend/src/views/task/TaskDetail.vue
@@ -15,11 +15,17 @@
</el-card>
</el-tab-pane>
<el-tab-pane :label="$t('Results')" name="results">
<div class="button-group">
<el-button type="primary" icon="el-icon-download" @click="downloadCSV">
{{$t('Download CSV')}}
</el-button>
</div>
<general-table-view :data="taskResultsData"
:columns="taskResultsColumns"
:page-num="resultsPageNum"
:page-size="resultsPageSize"
:total="taskResultsTotalCount"/>
:total="taskResultsTotalCount"
@page-change="onResultsPageChange"/>
</el-tab-pane>
</el-tabs>
</div>
@@ -78,6 +84,15 @@ export default {
},
onSpiderChange (id) {
this.$router.push(`/spiders/${id}`)
},
onResultsPageChange (payload) {
const { pageNum, pageSize } = payload
this.resultsPageNum = pageNum
this.resultsPageSize = pageSize
this.$store.dispatch('task/getTaskResults', this.$route.params.id)
},
downloadCSV () {
window.location.href = this.$request.baseUrl + '/tasks/' + this.$route.params.id + '/download_results'
}
},
created () {
@@ -114,4 +129,9 @@ export default {
overflow-x: auto;
overflow-y: auto;
}
.button-group {
margin-bottom: 10px;
text-align: right;
}
</style>
