diff --git a/hepcrawl/parsers/nlm.py b/hepcrawl/parsers/nlm.py new file mode 100644 index 00000000..588619c4 --- /dev/null +++ b/hepcrawl/parsers/nlm.py @@ -0,0 +1,334 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2018 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Parser for NLM data format""" + +from __future__ import absolute_import, division, print_function + +import six + +from itertools import chain + +from inspire_schemas.api import LiteratureBuilder +from inspire_utils.date import PartialDate +from inspire_utils.helpers import maybe_int +from inspire_utils.name import ParsedName + +from ..utils import get_node + + +NLM_OBJECT_TYPE_TO_HEP_MAP = { + 'Erratum': 'erratum', + 'Reprint': 'reprint', + 'Update': 'addendum', + 'Dataset': 'data', +} + + +class NLMParser(object): + """Parser for the NLM format. + + It can be used directly by invoking the :func:`NLMParser.parse` method, + or be subclassed to customize its behavior. + + Args: + nlm_record (Union[string, scrapy.selector.Selector]): the record in NLM + format to parse. + source (Optional[string]): if provided, sets the ``source`` everywhere + in the record. Otherwise the source is extracted from the metadata. + """ + def __init__(self, nlm_record, source=None): + self.root = self.get_root_node(nlm_record) + if not source: + source = self.publisher + self.builder = LiteratureBuilder(source) + + def parse(self): + """Extract an NLM record into an Inspire HEP record. + + Returns: + dict: the same record in the Inspire Literature schema. 
+ """ + self.builder.add_abstract(self.abstract) + self.builder.add_title(self.title) + self.builder.add_copyright(**self.copyright) + self.builder.add_document_type(self.document_type) + for author in self.authors: + self.builder.add_author(author) + self.builder.add_publication_info(**self.publication_info) + self.builder.add_publication_type(self.publication_type) + for collab in self.collaborations: + self.builder.add_collaboration(collab) + for doi in self.dois: + self.builder.add_doi(**doi) + for keyword in self.keywords: + self.builder.add_keyword(keyword) + self.builder.add_imprint_date(self.print_publication_date.dumps()) + + return self.builder.record + + @classmethod + def bulk_parse(cls, nlm_records, source=None): + """Parse a whole ArticleSet. + + Args: + nlm_records (Union[string, scrapy.selector.Selector]): records + source (Optional[string]): source passed to `__init__` + """ + root = cls.get_root_node(nlm_records) + nlm_records = root.xpath('/ArticleSet/Article').extract() + return [ + cls(nlm_record, source=source).parse() + for nlm_record in nlm_records + ] + + @property + def abstract(self): + return self.root.xpath('normalize-space(./Abstract)').extract_first() + + @property + def title(self): + return self.root.xpath('./ArticleTitle/text()').extract_first() + + @property + def copyright(self): + return { + 'material': self.material, + 'statement': self.copyright_statement, + } + + @property + def copyright_statement(self): + return self.root.xpath( + 'normalize-space(./CopyrightInformation)' + ).extract_first() + + @property + def document_type(self): + """Return an applicable inspire document_type. 
+ + For list of NLM PublicationTypes see: + www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.PublicationType_O + """ + pub_type = self.root.xpath('./PublicationType/text()').extract_first() + + if 'Conference' in pub_type or pub_type == 'Congresses': + return 'proceedings' + if 'Report' in pub_type: + return 'report' + + return 'article' + + @property + def publication_type(self): + """Return an applicable inspire publication_type. + + For list of NLM PublicationTypes see: + www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.PublicationType_O + """ + pub_type = self.root.xpath('./PublicationType/text()').extract_first() + + if pub_type == 'Lectures': + return 'lectures' + if pub_type == 'Review': + return 'review' + + @property + def authors(self): + authors = self.root.xpath('./AuthorList/Author') + authors_in_collaborations = self.root.xpath( + './GroupList/Group' + '[GroupName/text()=../../AuthorList/Author/CollectiveName/text()]' + '/IndividualName' + ) + return [ + self.get_author(author) + for author in chain(authors, authors_in_collaborations) + if self.get_author(author) is not None + ] + + @property + def publication_info(self): + pub_date = self.print_publication_date or self.online_publication_date + + publication_info = { + 'journal_title': self.journal_title, + 'journal_issue': self.journal_issue, + 'journal_volume': self.journal_volume, + 'material': self.material, + 'page_start': self.page_start, + 'page_end': self.page_end, + 'year': pub_date.year, + } + + return publication_info + + @property + def journal_title(self): + return self.root.xpath('./Journal/JournalTitle/text()').extract_first() + + @property + def journal_issue(self): + return self.root.xpath('./Journal/Issue/text()').extract_first() + + @property + def journal_volume(self): + return self.root.xpath('./Journal/Volume/text()').extract_first() + + @property + def material(self): + object_type = self.root.xpath('Object/@Type').extract_first() + + if object_type in 
NLM_OBJECT_TYPE_TO_HEP_MAP: + return NLM_OBJECT_TYPE_TO_HEP_MAP[object_type] + + return 'publication' + + + @property + def page_start(self): + return self.root.xpath('./FirstPage/text()').extract_first() + + @property + def page_end(self): + return self.root.xpath('./LastPage/text()').extract_first() + + @property + def collaborations(self): + return self.root.xpath('.//Author/CollectiveName/text()').extract() + + @property + def dois(self): + dois = self.root.xpath( + './/ArticleIdList/ArticleId[@IdType="doi"]/text()' + ).extract() + + if not dois: + dois = self.root.xpath( + './/ELocationID[@EIdType="doi"]/text()' + ).extract() + + return [{'doi': value, 'material': self.material} for value in dois] + + @property + def keywords(self): + return self.root.xpath( + './ObjectList/Object[@Type="keyword"]/Param[@Name="value"]/text()' + ).extract() + + @property + def print_publication_date(self): + """Date of the print publication. + + PubDate tags may appear in root of the Article or as part of + article's History. + """ + pub_date = self.root.xpath('.//PubDate[@PubStatus="ppublish"]') + pub_date_no_tag = self.root.xpath('.//PubDate[not(@PubStatus)]') + return self.partial_date_from_date_node(pub_date or pub_date_no_tag) + + @property + def online_publication_date(self): + """Date of the online-only publication. + + PubDate tags may appear in root of the Article or as part of + article's History. + """ + pub_date = self.root.xpath('.//PubDate[@PubStatus="epublish"]') + return self.partial_date_from_date_node(pub_date) + + @property + def publisher(self): + return self.root.xpath( + './Journal/PublisherName/text()' + ).extract_first() + + @staticmethod + def get_root_node(record): + """Get a selector on the root ``ArticleSet`` node of the record. + + This can be overridden in case some preprocessing needs to be done on + the XML. + + Args: + record(Union[str, scrapy.selector.Selector]): + the record in NLM format. 
+ + Returns: + scrapy.selector.Selector: a selector on the root ``ArticleSet
`` + node. + """ + if isinstance(record, six.string_types): + root = get_node(record) + else: + root = record + + return root + + def get_author(self, author_node): + """Get HEP conforming author information + + Args: + author_node(scrapy.selector.Selector): node + + Returns: + dict: extracted author information + """ + first = author_node.xpath('./FirstName/text()').extract_first() + middle = author_node.xpath('./MiddleName/text()').extract_first() + last = author_node.xpath('./LastName/text()').extract_first() + suffix = author_node.xpath('./Suffix/text()').extract_first() + full_name = ParsedName.from_parts(first, last, middle, suffix).dumps() + + affiliations = author_node.xpath('.//Affiliation/text()').extract() + affiliations = [self.normalize_space(aff) for aff in affiliations] + ids = author_node.xpath('./Identifier/text()').extract() + + return self.builder.make_author( + full_name, + raw_affiliations=affiliations, + ids=[(None, id_) for id_ in ids], + ) + + @staticmethod + def partial_date_from_date_node(node): + """Parse an XML date node into PartialDate, if possible. + + Args: + node (scrapy.selector.Selector): an XML node to parse + + Returns: + Union[PartialDate, None]: a PartialDate, or None if it couldn't be parsed + """ + try: + day = node.xpath('./Day/text()').extract_first() + month = node.xpath('./Month/text()').extract_first() + year = node.xpath('./Year/text()').extract_first() + return PartialDate( + maybe_int(year), + maybe_int(month), + maybe_int(day) + ) + except ValueError: + return None + + @staticmethod + def normalize_space(text): + """XML normalize space. + + Removes leading and trailing whitespace, + replaces strings of whitespace with single space. 
+ + Args: + text (string): input string + + Returns: + string: normalized string + """ + return " ".join(text.split()) diff --git a/tests/unit/responses/iop/expected.yaml b/tests/unit/responses/iop/expected.yaml new file mode 100644 index 00000000..335c3833 --- /dev/null +++ b/tests/unit/responses/iop/expected.yaml @@ -0,0 +1,82 @@ +abstract: Somatic BRAF mutation in colon cancer essentially excludes Lynch + syndrome. We compared BRAF V600E immunohistochemistry (IHC) with BRAF + mutation in core, biopsy, and whole-section slides to determine whether + IHC is similar and to assess the cost-benefit of IHC. Resection cases + (2009-2013) with absent MLH1 and PMS2 and prior BRAF mutation polymerase + chain reaction results were chosen (n = 57). To mimic biopsy specimens, + tissue microarrays (TMAs) were constructed. In addition, available biopsies + performed prior to the resection were available in 15 cases. BRAF V600E IHC + was performed and graded on TMAs, available biopsy specimens, and + whole-section slides. Mutation status was compared with IHC, and + cost-benefit analysis was performed. BRAF V600E IHC was similar in TMAs, + biopsy specimens, and whole-section slides, with only four (7%) showing + discordance between IHC and mutation status. Using BRAF V600E IHC in our + Lynch syndrome screening algorithm, we found a 10% cost savings compared + with mutational analysis. BRAF V600E IHC was concordant between TMAs, + biopsy specimens, and whole-section slides, suggesting biopsy specimens are + as useful as whole sections. IHC remained cost beneficial compared with + mutational analysis, even though more patients needed additional molecular + testing to exclude Lynch syndrome. +title: 'A Modified Lynch Syndrome Screening Algorithm in Colon Cancer: BRAF + Immunohistochemistry Is Efficacious and Cost Beneficial.' 
+copyright_statement: Copyright© by the American Society for Clinical Pathology +document_type: article +publication_type: null +authors: +- full_name: Roth, Rachel M. + raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Hampel, Heather + raw_affiliations: + - value: Department of Human Genetics, + The Ohio State University Wexner Medical Center Columbus + source: American Society for Clinical Pathology +- full_name: Arnold, Christina A. + raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology + - value: Department of Microbiology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Yearsley, Martha M. + raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Marsh, William L. + raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Frankel, Wendy L. 
+ raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology + - value: Department of Human Genetics, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Smith, John +- full_name: Jones, Mary +journal_title: Am J Clin Pathol +journal_issue: '3' +journal_volume: '143' +material: publication +page_end: '343' +page_start: '336' +collaborations: +- Cancer Genome Center +dois: +- doi: 10.1309/AJCP4D7RXOBHLKGJ + material: publication +keywords: +- BRAF +- MLH1 +- Immunohistochemistry +- Cost-benefit analysis +print_publication_date: 2015-03 +online_publication_date: null +publisher: American Society for Clinical Pathology \ No newline at end of file diff --git a/tests/unit/responses/iop/xml/test_standard.xml b/tests/unit/responses/iop/xml/test_standard.xml index 74aad82f..2d91653a 100644 --- a/tests/unit/responses/iop/xml/test_standard.xml +++ b/tests/unit/responses/iop/xml/test_standard.xml @@ -71,7 +71,23 @@ Columbus + + Cancer Genome Center + + + + Cancer Genome Center + + John + Smith + + + Mary + Jones + + + Journal Article j143/3/336 diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py index 9061da32..d16977d7 100644 --- a/tests/unit/test_iop.py +++ b/tests/unit/test_iop.py @@ -117,29 +117,6 @@ def test_publication_info(record): assert record["journal_issn"][0] == journal_issn -def test_authors(record): - """Test authors.""" - authors = ['Roth, Rachel M', 'Hampel, Heather', 'Arnold, Christina A', - 'Yearsley, Martha M', 'Marsh, William L', 'Frankel, Wendy L'] - - affiliations = [ - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}], - [{'value': u'Department of Human Genetics, The Ohio State University Wexner Medical Center Columbus'}], - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}, 
- {'value': u'Department of Microbiology, The Ohio State University Wexner Medical Center, Columbus'}], - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}], - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}], - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}, - {'value': u'Department of Human Genetics, The Ohio State University Wexner Medical Center, Columbus'}] - ] - - assert "authors" in record - assert len(record["authors"]) == 6 - for index, (name, aff) in enumerate(zip(authors, affiliations)): - assert record["authors"][index]["full_name"] == name - assert record["authors"][index]["affiliations"] == aff - - def test_copyrights(record): """Test extracting copyright.""" copyright_holder = "American Society for Clinical Pathology" diff --git a/tests/unit/test_parsers_nlm.py b/tests/unit/test_parsers_nlm.py new file mode 100644 index 00000000..335f2e15 --- /dev/null +++ b/tests/unit/test_parsers_nlm.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2018 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. 
+ +from __future__ import ( + absolute_import, + division, + print_function, +) + +import pytest +import yaml + +from inspire_schemas.utils import validate +from hepcrawl.testlib.fixtures import get_test_suite_path +from hepcrawl.parsers.nlm import NLMParser + + +@pytest.fixture(scope='module') +def expected(): + """A dictionary holding the parsed elements of the record.""" + path = get_test_suite_path('responses', 'iop', 'expected.yaml') + with open(path) as f: + nlm_expected_dict = yaml.load(f) + + return nlm_expected_dict + + +@pytest.fixture(scope='module') +def xml_test_string(): + path = get_test_suite_path('responses', 'iop', 'xml', 'test_standard.xml') + with open(path) as f: + return f.read() + + +@pytest.fixture(scope='module') +def parser(xml_test_string): + """An NLMParser instantiated on a PubMed article.""" + root = NLMParser.get_root_node(xml_test_string) + article = root.xpath('/ArticleSet/Article').extract_first() + return NLMParser(article) + + +def test_bulk_parse(xml_test_string): + for record in NLMParser.bulk_parse(xml_test_string): + assert validate(record, 'hep') == None + + +FIELDS_TO_CHECK = [ + 'abstract', + 'title', + 'copyright_statement', + 'document_type', + 'publication_type', + 'authors', + 'journal_title', + 'journal_issue', + 'journal_volume', + 'material', + 'page_start', + 'page_end', + 'collaborations', + 'dois', + 'keywords', + 'online_publication_date', + 'publisher', +] +FIELDS_TO_CHECK_SEPARATELY = [ + 'print_publication_date', +] + + +def test_data_completeness(expected): + tested_fields = FIELDS_TO_CHECK + FIELDS_TO_CHECK_SEPARATELY + for field in expected.keys(): + assert field in tested_fields + + +@pytest.mark.parametrize( + 'field_name', + FIELDS_TO_CHECK +) +def test_field(field_name, expected, parser): + # if field_name == 'authors': + # import pdb + # pdb.set_trace() + + result = getattr(parser, field_name) + expected = expected[field_name] + + assert result == expected + + +def 
test_print_publication_date(expected, parser): + assert expected['print_publication_date'] == parser.print_publication_date.dumps()