Skip to content

Commit

Permalink
use self.logger in all of the spiders (inspirehep#183)
Browse files Browse the repository at this point in the history
This unifies the logging mechanism used across all of the
spiders. Previously some spiders used `self.log` instead of `self.logger`.

In some cases (spiders.common.*) the spiders create their
logger independently via `logging.getLogger`. I am
leaving this as is, since it provides additional debugging
information: it shows that the message comes from
the superclass.

Signed-off-by: Szymon Łopaciuk <[email protected]>
  • Loading branch information
szymonlopaciuk committed Jan 30, 2018
1 parent 25a441b commit f846388
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 14 deletions.
24 changes: 13 additions & 11 deletions hepcrawl/spiders/desy_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ def crawl_local_directory(self):

for file_name in xml_file_names:
file_path = os.path.join(self.source_folder, file_name)
self.log('Local: Try to crawl local file: {0}'.format(file_path))
self.logger.info(
'Local: Try to crawl local file: {0}'.format(file_path)
)
yield Request(
'file://{0}'.format(file_path),
callback=self.parse,
Expand All @@ -133,7 +135,7 @@ def crawl_ftp_directory(self):
xml_remote_files_paths = self._filter_xml_files(remote_files_paths)

for remote_file in xml_remote_files_paths:
self.log(
self.logger.info(
'Remote: Try to crawl file from FTP: {0}'.format(remote_file),
)
remote_file = str(remote_file)
Expand All @@ -158,7 +160,7 @@ def handle_package_ftp(self, response):
response(hepcrawl.http.response.Response): response containing the
information about the ftp file download.
"""
self.log('Visited url {}'.format(response.url))
self.logger.info('Visited url {}'.format(response.url))
file_path = response.body
yield Request(
'file://{0}'.format(file_path),
Expand Down Expand Up @@ -207,8 +209,8 @@ def parse(self, response):
"""Parse a ``Desy`` XML file into a :class:`hepcrawl.utils.ParsedItem`.
"""

self.log('Got record from url/path: {0}'.format(response.url))
self.log('FTP enabled: {0}'.format(self.ftp_enabled))
self.logger.info('Got record from url/path: {0}'.format(response.url))
self.logger.info('FTP enabled: {0}'.format(self.ftp_enabled))
ftp_params = None

if self.ftp_enabled:
Expand All @@ -225,12 +227,12 @@ def parse(self, response):
url_schema = 'file'
hostname = None

self.log('Getting marc xml records...')
self.logger.info('Getting marc xml records...')
marcxml_records = self._get_marcxml_records(response.body)
self.log('Got %d marc xml records' % len(marcxml_records))
self.log('Getting hep records...')
self.logger.info('Got %d marc xml records' % len(marcxml_records))
self.logger.info('Getting hep records...')
hep_records = self._hep_records_from_marcxml(marcxml_records)
self.log('Got %d hep records' % len(hep_records))
self.logger.info('Got %d hep records' % len(hep_records))

for hep_record in hep_records:
files_to_download = [
Expand All @@ -244,7 +246,7 @@ def parse(self, response):
if self._has_to_be_downloaded(document['url'])
]

self.log(
self.logger.info(
'Got the following attached documents to download: %s'
% files_to_download
)
Expand All @@ -254,7 +256,7 @@ def parse(self, response):
ftp_params=ftp_params,
record_format='hep',
)
self.log('Got item: %s' % parsed_item)
self.logger.info('Got item: %s' % parsed_item)

yield parsed_item

Expand Down
2 changes: 1 addition & 1 deletion hepcrawl/spiders/elsevier_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def handle_feed(self, response):

def handle_package(self, response):
"""Handle the zip package and yield a request for every XML found."""
self.log("Visited %s" % response.url)
self.logger.info("Visited %s" % response.url)
filename = os.path.basename(response.url).rstrip(".zip")
# TMP dir to extract zip packages:
target_folder = mkdtemp(prefix="elsevier_" + filename + "_", dir="/tmp/")
Expand Down
4 changes: 2 additions & 2 deletions hepcrawl/spiders/pos_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def start_requests(self):
yield Request(self.source_file)

def parse(self, response):
self.log('Got record from: {response.url}'.format(**vars()))
self.logger.info('Got record from: {response.url}'.format(**vars()))

response.selector.remove_namespaces()
record_xml_selectors = response.selector.xpath('.//record')
Expand Down Expand Up @@ -125,7 +125,7 @@ def get_conference_paper_page_request(self, xml_selector, meta=None):
)

def parse_conference_paper(self, response):
self.log(
self.logger.info(
'Parsing conference paper from: {response.url}'.format(**vars())
)
xml_record = response.meta.get('xml_record')
Expand Down

0 comments on commit f846388

Please sign in to comment.