Merge pull request #75 from tikazyq/develop

Develop
crawlab-team · Jun 29, 2019 · 839dfca · 839dfca
2 parents c913f9d + d66f4d4
commit 839dfca
Show file tree

Hide file tree

Showing 17 changed files with 172 additions and 93 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,7 +2,6 @@
 ### Features / Enhancement
 - **Documentation**: Better and much more detailed documentation.
 - **Better Crontab**: Make crontab expression through crontab UI.
-- **High Concurrency**: `gevent` + `flask` to support high concurrency. [#70](https://github.com/tikazyq/crawlab/issues/70)
 
 ### Bugs Fixes
 - **Deleting Spider**. Deleting a spider not only removes its record in the db but also removes the related folder, tasks and schedules. [#69](https://github.com/tikazyq/crawlab/issues/69)

diff --git a/Dockerfile b/Dockerfile
@@ -35,8 +35,8 @@ RUN npm install -g yarn \
 	&& yarn install
 
 # install backend
-RUN pip install -U setuptools \
-	&& pip install -r /opt/crawlab/crawlab/requirements.txt
+RUN pip install -U setuptools -i https://pypi.tuna.tsinghua.edu.cn/simple \
+	&& pip install -r /opt/crawlab/crawlab/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
 
 # start backend
 EXPOSE 8080

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -24,7 +24,7 @@ pipeline {
             steps {
                 echo "Building..."
                 sh """
-                docker build -t crawlab:latest .
+                docker build -t tikazyq/crawlab:latest .
                 """
             }
         }
@@ -37,12 +37,13 @@ pipeline {
             steps {
                 echo 'Deploying....'
                 sh """
-                docker stop crawlab | true
-                docker run -d --rm --restart always --name crawlab \
+                docker rm -f crawlab | true
+                docker run -d --rm --name crawlab \
                     -p 8080:8080 \
                     -p 8000:8000 \
                     -v /home/yeqing/.env.production:/opt/crawlab/frontend/.env.production \
-                    -v /home/yeqing/config.py:/opt/crawlab/crawlab/config/config.py
+                    -v /home/yeqing/config.py:/opt/crawlab/crawlab/config/config.py \
+                    tikazyq/crawlab master
                 """
             }
         }

diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt
@@ -13,3 +13,4 @@ Werkzeug==0.15.2
 eventlet
 Celery
 Flower
+redis
diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py
@@ -2,6 +2,7 @@
 import sys
 from datetime import datetime
 from time import sleep
+import traceback
 
 from bson import ObjectId
 from pymongo import ASCENDING, DESCENDING
@@ -213,8 +214,10 @@ def execute_config_spider(self, id: str, params: str = None):
     env['MONGO_HOST'] = MONGO_HOST
     env['MONGO_PORT'] = str(MONGO_PORT)
     env['MONGO_DB'] = MONGO_DB
-    env['MONGO_USERNAME'] = MONGO_USERNAME
-    env['MONGO_PASSWORD'] = MONGO_PASSWORD
+    if MONGO_USERNAME is not None:
+        env['MONGO_USERNAME'] = MONGO_USERNAME
+    if MONGO_PASSWORD:
+        env['MONGO_PASSWORD'] = MONGO_PASSWORD
 
     cmd_arr = [
         sys.executable,
@@ -246,6 +249,7 @@ def execute_config_spider(self, id: str, params: str = None):
         else:
             status = TaskStatus.FAILURE
     except Exception as err:
+        traceback.print_exc()
         logger.error(err)
         stderr.write(str(err))
         status = TaskStatus.FAILURE

diff --git a/spiders/example_juejin/juejin/items.py b/spiders/example_juejin/juejin/items.py
diff --git a/spiders/example_juejin/juejin/pipelines.py b/spiders/example_juejin/juejin/pipelines.py
diff --git a/spiders/example_juejin/juejin/spiders/juejin_spider.py b/spiders/example_juejin/juejin/spiders/juejin_spider.py
diff --git a/spiders/example_juejin/start.py b/spiders/example_juejin/start.py
diff --git a/spiders/example_juejin/scrapy.cfg → spiders/xueqiu/scrapy.cfg b/spiders/example_juejin/scrapy.cfg → spiders/xueqiu/scrapy.cfg
@@ -1,11 +1,11 @@
 # Automatically created by: scrapy startproject
 #
 # For more information about the [deploy] section see:
-# https://scrapyd.readthedocs.org/en/latest/deploy.html
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
 
 [settings]
-default = juejin.settings
+default = xueqiu.settings
 
 [deploy]
 #url = http://localhost:6800/
-project = juejin
+project = xueqiu
diff --git a/spiders/example_juejin/juejin/__init__.py → spiders/xueqiu/xueqiu/__init__.py b/spiders/example_juejin/juejin/__init__.py → spiders/xueqiu/xueqiu/__init__.py
diff --git a/spiders/xueqiu/xueqiu/items.py b/spiders/xueqiu/xueqiu/items.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class XueqiuItem(scrapy.Item):
+    """Scrapy item describing one record scraped for the xueqiu project."""
+
+    # Storage key: XueqiuPipeline.process_item copies `id` into `_id` before
+    # looking the record up in (and writing it to) the MongoDB collection.
+    _id = scrapy.Field()
+    # Filled by XueqiuPipeline.process_item from the CRAWLAB_TASK_ID
+    # environment variable.
+    task_id = scrapy.Field()
+    # Identifier of the record; reused as `_id`, so it also acts as the
+    # de-duplication key in the pipeline.
+    id = scrapy.Field()
+    # NOTE(review): the remaining fields are presumably populated by the
+    # spider, which is not visible here — confirm their meaning there.
+    text = scrapy.Field()
+    target = scrapy.Field()
+    view_count = scrapy.Field()
+    mark = scrapy.Field()
+    created_at = scrapy.Field()
diff --git a/spiders/example_juejin/juejin/middlewares.py → spiders/xueqiu/xueqiu/middlewares.py b/spiders/example_juejin/juejin/middlewares.py → spiders/xueqiu/xueqiu/middlewares.py
@@ -3,12 +3,12 @@
 # Define here the models for your spider middleware
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
 from scrapy import signals
 
 
-class JuejinSpiderMiddleware(object):
+class XueqiuSpiderMiddleware(object):
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.
@@ -54,3 +54,50 @@ def process_start_requests(self, start_requests, spider):
 
     def spider_opened(self, spider):
         spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class XueqiuDownloaderMiddleware(object):
+    """Pass-through downloader middleware for the xueqiu project.
+
+    Scrapy treats every hook as optional: a hook that is not defined is
+    handled as if the middleware does not modify the passed objects. The
+    hooks defined here leave requests, responses and exceptions untouched.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and subscribe it to ``spider_opened``.
+
+        Scrapy calls this factory to instantiate the middleware.
+        """
+        middleware = cls()
+        crawler.signals.connect(
+            middleware.spider_opened,
+            signal=signals.spider_opened,
+        )
+        return middleware
+
+    def process_request(self, request, spider):
+        """Hook called for each request going through the downloader middleware.
+
+        A hook of this kind must do one of the following:
+        - return None to continue processing this request (what we do here),
+        - return a Response object,
+        - return a Request object,
+        - raise IgnoreRequest, in which case the process_exception() methods
+          of the installed downloader middleware are called.
+        """
+        return None
+
+    def process_response(self, request, response, spider):
+        """Hook called with the response returned from the downloader.
+
+        A hook of this kind must do one of the following:
+        - return a Response object (here: the response, unchanged),
+        - return a Request object,
+        - raise IgnoreRequest.
+        """
+        return response
+
+    def process_exception(self, request, exception, spider):
+        """Hook called when a download handler or a process_request() raises.
+
+        The exception may also come from the process_request() of another
+        downloader middleware. A hook of this kind must do one of the
+        following:
+        - return None to continue processing this exception (what we do here),
+        - return a Response object, which stops the process_exception() chain,
+        - return a Request object, which stops the process_exception() chain.
+        """
+        return None
+
+    def spider_opened(self, spider):
+        """Log the name of the spider that was just opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
diff --git a/spiders/xueqiu/xueqiu/pipelines.py b/spiders/xueqiu/xueqiu/pipelines.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import os
+
+from pymongo import MongoClient
+
+
+class XueqiuPipeline(object):
+    """Item pipeline that stores scraped xueqiu items in MongoDB.
+
+    Connection settings are read from the environment, each with a local
+    fallback: MONGO_HOST ('localhost'), MONGO_PORT (27017), MONGO_DB
+    ('crawlab_test') and CRAWLAB_COLLECTION ('results_xueqiu').
+    """
+
+    # Class-level attributes: the client, database and collection handles are
+    # created once when the class body runs and are shared by all instances.
+    # NOTE(review): no MONGO_USERNAME / MONGO_PASSWORD is passed here —
+    # confirm whether authenticated deployments need them.
+    mongo = MongoClient(
+        host=os.environ.get('MONGO_HOST') or 'localhost',
+        port=int(os.environ.get('MONGO_PORT') or 27017)
+    )
+    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
+    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
+
+    def process_item(self, item, spider):
+        """Tag the item with its task id and store it if it is not stored yet.
+
+        Args:
+            item: the scraped item; must contain an ``id`` value, which is
+                copied into ``_id`` and used as the de-duplication key.
+            spider: the spider that produced the item (unused).
+
+        Returns:
+            The item, whether or not it was newly inserted.
+        """
+        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
+        item['_id'] = item['id']
+        if self.col.find_one({'_id': item['_id']}) is None:
+            # insert_one replaces the deprecated Collection.save(); an insert
+            # is sufficient because we only get here when no document with
+            # this _id exists.
+            self.col.insert_one(dict(item))
+        # Always hand the item on: a pipeline must return an item (or raise
+        # DropItem). Returning it only for new records made duplicates yield
+        # None, which would be passed to any later pipeline stage.
+        return item
diff --git a/spiders/example_juejin/juejin/settings.py → spiders/xueqiu/xueqiu/settings.py b/spiders/example_juejin/juejin/settings.py → spiders/xueqiu/xueqiu/settings.py
@@ -1,21 +1,21 @@
 # -*- coding: utf-8 -*-
 
-# Scrapy settings for juejin project
+# Scrapy settings for xueqiu project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
-#     http://doc.scrapy.org/en/latest/topics/settings.html
-#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'juejin'
+BOT_NAME = 'xueqiu'
 
-SPIDER_MODULES = ['juejin.spiders']
-NEWSPIDER_MODULE = 'juejin.spiders'
+SPIDER_MODULES = ['xueqiu.spiders']
+NEWSPIDER_MODULE = 'xueqiu.spiders'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
@@ -24,7 +24,7 @@
 # CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
 # DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
@@ -44,31 +44,31 @@
 # }
 
 # Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 # SPIDER_MIDDLEWARES = {
-#    'juejin.middlewares.JuejinSpiderMiddleware': 543,
+#    'xueqiu.middlewares.XueqiuSpiderMiddleware': 543,
 # }
 
 # Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 # DOWNLOADER_MIDDLEWARES = {
-#    'juejin.middlewares.MyCustomDownloaderMiddleware': 543,
+#    'xueqiu.middlewares.XueqiuDownloaderMiddleware': 543,
 # }
 
 # Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
 # EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
 # }
 
 # Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'juejin.pipelines.JuejinPipeline': 300,
+    'xueqiu.pipelines.XueqiuPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
 # AUTOTHROTTLE_ENABLED = True
 # The initial download delay
 # AUTOTHROTTLE_START_DELAY = 5
@@ -81,7 +81,7 @@
 # AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 # HTTPCACHE_ENABLED = True
 # HTTPCACHE_EXPIRATION_SECS = 0
 # HTTPCACHE_DIR = 'httpcache'

diff --git a/...example_juejin/juejin/spiders/__init__.py → spiders/xueqiu/xueqiu/spiders/__init__.py b/...example_juejin/juejin/spiders/__init__.py → spiders/xueqiu/xueqiu/spiders/__init__.py
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,4 @@ Werkzeug==0.15.2 @@
     eventlet
     Celery
     Flower
+    redis