Skip to content

Commit

Permalink
CDS spider: drop HarvestingKit (inspirehep#199)
Browse files (browse the repository at this point in the history)
Harvests CDS through dojson directly: closes inspirehep#199.

Signed-off-by: Szymon Łopaciuk <[email protected]>
  • Loading branch information
szymonlopaciuk committed Dec 12, 2017
1 parent 7e7afbb commit c6fbed0
Show file tree
Hide file tree
Showing 2 changed files with 683 additions and 432 deletions.
42 changes: 12 additions & 30 deletions hepcrawl/spiders/cds_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,7 @@
"""Spider for the CERN Document Server OAI-PMH interface"""

import logging
from scrapy import Request
from flask.app import Flask
from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire
from harvestingkit.bibrecord import (
create_record as create_bibrec,
record_xml_output,
)
from dojson.contrib.marc21.utils import create_record
from inspire_dojson.hep import hep

Expand All @@ -34,10 +28,8 @@ class CDSSpider(OAIPMHSpider):
$ scrapy crawl CDS \\
-a "oai_set=forINSPIRE" -a "from_date=2017-10-10"
It uses `HarvestingKit <https://pypi.python.org/pypi/HarvestingKit>`_ to
translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. It then
employs `inspire-dojson <https://pypi.python.org/pypi/inspire-dojson>`_ to
transform the legacy INSPIRE MARCXML into the new INSPIRE Schema.
It uses `inspire-dojson <https://pypi.python.org/pypi/inspire-dojson>`_ to
translate from CDS's MARCXML into the new INSPIRE Schema.
"""

name = 'CDS'
Expand All @@ -57,23 +49,13 @@ def __init__(self,

def parse_record(self, selector):
selector.remove_namespaces()
try:
cds_bibrec, ok, errs = create_bibrec(selector.xpath('.//record').extract()[0])
if not ok:
raise RuntimeError("Cannot parse record %s: %s", selector, errs)
self.logger.info("Here's the record: %s" % cds_bibrec)
inspire_bibrec = CDS2Inspire(cds_bibrec).get_record()
marcxml_record = record_xml_output(inspire_bibrec)
record = create_record(marcxml_record)
app = Flask('hepcrawl')
app.config.update(
self.settings.getdict('MARC_TO_HEP_SETTINGS', {})
)
with app.app_context():
json_record = hep.do(record)
base_uri = self.settings['SCHEMA_BASE_URI']
json_record['$schema'] = base_uri + 'hep.json'
return ParsedItem(record=json_record, record_format='hep')
except Exception:
logger.exception("Error when parsing record")
return None
record = create_record(selector.xpath('.//record').extract()[0])
app = Flask('hepcrawl')
app.config.update(
self.settings.getdict('MARC_TO_HEP_SETTINGS', {})
)
with app.app_context():
json_record = hep.do(record)
base_uri = self.settings['SCHEMA_BASE_URI']
json_record['$schema'] = base_uri + 'hep.json'
return ParsedItem(record=json_record, record_format='hep')
Loading

0 comments on commit c6fbed0

Please sign in to comment.