From d09d52af5f61381408f11240061b0c695e5fc944 Mon Sep 17 00:00:00 2001 From: Adilson Carvalho Date: Mon, 28 Nov 2016 19:28:13 -0200 Subject: [PATCH] Create a version number for the crawler (#30) It looks for a file for the class with a .version extension to be created by the CI and uses the class name + dev during development. * Created get_version method * Added a spider field to carry spider related data --- nfcrawler/nfcrawler/items.py | 6 ++++++ nfcrawler/nfcrawler/loaders.py | 3 +++ nfcrawler/nfcrawler/spiders/pr_nfce.py | 7 +++++++ nfcrawler/nfcrawler/utils.py | 14 ++++++++++++++ 4 files changed, 30 insertions(+) diff --git a/nfcrawler/nfcrawler/items.py b/nfcrawler/nfcrawler/items.py index 8a70945..108cef1 100644 --- a/nfcrawler/nfcrawler/items.py +++ b/nfcrawler/nfcrawler/items.py @@ -4,6 +4,11 @@ from scrapy.loader.processors import MapCompose, Identity from nfcrawler.utils import to_int, to_decimal, to_datetime, string_cleaner +# --- + +class SpiderItem(Item): + version = Field() + # --- class NFeEmitenteItem(Item): razao_social = Field() @@ -210,6 +215,7 @@ class ProdutoItem(Item): # --- class DocumentItem(Item): + spider = Field(serializer=SpiderItem) nfe = Field(serializer=NFeItem) emitente = Field(serializer=EmitenteItem) destinatario = Field(serializer=DestinatarioItem) diff --git a/nfcrawler/nfcrawler/loaders.py b/nfcrawler/nfcrawler/loaders.py index f98c6d1..c1d4fdd 100644 --- a/nfcrawler/nfcrawler/loaders.py +++ b/nfcrawler/nfcrawler/loaders.py @@ -15,6 +15,9 @@ class DocumentLoader(ItemLoader): class NFeLoader(CommonLoader): default_item_class = NFeItem +class SpiderLoader(CommonLoader): + default_item_class = SpiderItem + class NFeEmitenteLoader(CommonLoader): default_item_class = NFeEmitenteItem diff --git a/nfcrawler/nfcrawler/spiders/pr_nfce.py b/nfcrawler/nfcrawler/spiders/pr_nfce.py index f805a01..5194c66 100644 --- a/nfcrawler/nfcrawler/spiders/pr_nfce.py +++ b/nfcrawler/nfcrawler/spiders/pr_nfce.py @@ -3,6 +3,7 @@ import re from 
nfcrawler.loaders import * from nfcrawler.items import * +from nfcrawler.utils import get_version # Extracted documents at https://github.com/adilsoncarvalho/barateza-nfcrawler/wiki/NFCe-Paraná @@ -36,6 +37,7 @@ def parse(self, response): def parse_detailed_page(self, response): loader = DocumentLoader(response=response) + loader.add_value('spider', self.get_spider(response)) loader.add_value('nfe', self.get_nfe(response)) loader.add_value('emitente', self.get_emitente(response)) loader.add_value('destinatario', self.get_destinatario(response)) @@ -45,6 +47,11 @@ def parse_detailed_page(self, response): loader.add_value('produtos', self.get_produtos(response)) return loader.load_item() + def get_spider(self, response): + loader = SpiderLoader(response=response) + loader.add_value('version', get_version(self)) + return loader.load_item() + def get_nfe(self, response): loader = NFeLoader(response=response) loader.add_css('chave_acesso', 'div.GeralXslt fieldset table.box tr:first-of-type td span') diff --git a/nfcrawler/nfcrawler/utils.py b/nfcrawler/nfcrawler/utils.py index 83692cc..a4dc8eb 100644 --- a/nfcrawler/nfcrawler/utils.py +++ b/nfcrawler/nfcrawler/utils.py @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- +import os import re +import subprocess +import inspect from unidecode import unidecode from w3lib.html import remove_tags @@ -44,3 +47,14 @@ def save_to_file(self, file_name, content): with open(file_name, 'w') as file: file.write(content) + +def get_version(self): + version_filename = get_version_file(self) + if os.path.exists(version_filename): + version = open(version_filename, 'r').read().strip() + else: + version = 'dev' + return self.name + '/' + version + +def get_version_file(self): + return os.path.splitext(inspect.getfile(self.__class__))[0] + '.version'