Skip to content

Commit

Permalink
IOP Spider: make use of NLM parser
Browse files: browse the repository at this point in the history
Signed-off-by: Szymon Łopaciuk <[email protected]>
  • Loading branch information
szymonlopaciuk committed Jan 17, 2018
1 parent 8face53 commit 75cc606
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 523 deletions.
158 changes: 0 additions & 158 deletions hepcrawl/extractors/nlm.py

This file was deleted.

92 changes: 29 additions & 63 deletions hepcrawl/spiders/iop_spider.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,13 +19,11 @@
from scrapy.spiders import XMLFeedSpider

from .common import StatefulSpider
from ..extractors.nlm import NLM
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..parsers import NLMParser
from ..utils import ParsedItem


class IOPSpider(StatefulSpider, XMLFeedSpider, NLM):
class IOPSpider(StatefulSpider, XMLFeedSpider):
"""IOPSpider crawler.
This spider should first be able to harvest files from `IOP STACKS`_.
Expand Down Expand Up @@ -152,7 +150,7 @@ def get_pdf_path(self, vol, issue, fpage):
if pattern in pdf_path:
return os.path.join(self.pdf_files, pdf_path)

def add_document(self, file_path, hidden, fulltext):
def make_document(self, file_path, hidden, fulltext):
"""Create a structured dictionary and add to 'files' item."""
file_dict = {
"hidden": hidden,
Expand All @@ -164,72 +162,40 @@ def add_document(self, file_path, hidden, fulltext):
return file_dict

def parse_node(self, response, node):
"""Parse the record XML and create a HEPRecord."""
record = HEPLoader(item=HEPRecord(), selector=node, response=response)

pub_status = self.get_pub_status(node)
if pub_status in {"aheadofprint", "received"}:
return None

fpage, lpage, page_nr = self.get_page_numbers(node)
volume = node.xpath(".//Journal/Volume/text()").extract_first()
issue = node.xpath(".//Journal/Issue/text()").extract_first()

record.add_value("journal_fpage", fpage)
record.add_value("journal_lpage", lpage)
record.add_xpath('abstract', ".//Abstract")
record.add_xpath("title", ".//ArticleTitle")
record.add_value('authors', self.get_authors(node))
journal_title = node.xpath(
".//Journal/JournalTitle/text()").extract_first()
record.add_value("journal_title", journal_title)
record.add_value("journal_issue", issue)
record.add_value("journal_volume", volume)
record.add_xpath("journal_issn", ".//Journal/Issn/text()")
record.add_dois(dois_values=self.get_dois(node))

journal_year = node.xpath(".//Journal/PubDate/Year/text()").extract()
if journal_year:
record.add_value("journal_year", int(journal_year[0]))

record.add_xpath("language", ".//Language/text()")
record.add_value("page_nr", page_nr)
record.add_value('date_published', self.get_date_published(node))
record.add_xpath('copyright_statement',
"./CopyrightInformation/text()")
record.add_xpath('copyright_holder', "//Journal/PublisherName/text()")
record.add_xpath(
'free_keywords', "ObjectList/Object[@Type='keyword']/Param[@Name='value']/text()")

record.add_xpath("related_article_doi", "//Replaces[@IdType='doi']/text()")
doctype = self.get_doctype(node) # FIXME: should these be mapped?
record.add_value("journal_doctype", doctype)
record.add_value('collections', self.get_collections(doctype))

xml_file_path = response.url
record.add_value(
"documents",
self.add_document(xml_file_path, hidden=True, fulltext=True),
"""Parse individual Article nodes to create a HEPRecord."""
parser = NLMParser(node, source='IOP')

xml_document = self.make_document(
file_path=response.url,
hidden=True,
fulltext=False,
)
parser.builder.add_document(**xml_document)

if self.pdf_files:
pdf_file_path = self.get_pdf_path(volume, issue, fpage)
pdf_file_path = self.get_pdf_path(
parser.journal_volume,
parser.journal_issue,
parser.page_start
)
if pdf_file_path:
if doctype and "erratum" in doctype.lower():
if parser.material == "erratum":
fulltext = False
else:
fulltext = True
if journal_title in self.OPEN_ACCESS_JOURNALS:
fulltext = True # FIXME
if parser.journal_title in self.OPEN_ACCESS_JOURNALS:
hidden = False
else:
hidden = True
record.add_value(
"documents",
self.add_document(pdf_file_path, hidden=hidden, fulltext=fulltext),

pdf_document = self.make_document(
file_path=pdf_file_path,
hidden=hidden,
fulltext=fulltext
)
parser.builder.add_document(**pdf_document)

parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
return ParsedItem(
record=parser.parse(),
record_format='hep',
)

return parsed_item
Empty file removed t.json
Empty file.
Loading

0 comments on commit 75cc606

Please sign in to comment.