Skip to content

Commit

Permalink
Rewrite parse reference function so it's easier to read
Browse files Browse the repository at this point in the history
  • Loading branch information
titipata committed Oct 6, 2016
1 parent f3bc5d1 commit 8dd7fe0
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 66 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ import pubmed_parser as pp
dict_out = pp.parse_pubmed_xml(path)
```

#### Parse Pubmed OA citations
#### Parse Pubmed OA citation references

The function `parse_pubmed_references` processes a PubMed Open Access XML
file and returns a list of the PMIDs it cites.
Expand Down Expand Up @@ -112,7 +112,7 @@ will return list of dictionaries where each has following keys.
- `table_xml`: raw xml text of the table (return if `return_xml=True`)

```python
dicts_out = pp.parse_pubmed_table('data/medline16n0902.xml.gz')
dicts_out = pp.parse_pubmed_table('data/medline16n0902.xml.gz', return_xml=False)
```

#### Parse Medline NML XML
Expand Down
112 changes: 48 additions & 64 deletions pubmed_parser/pubmed_oa_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,74 +168,70 @@ def parse_pubmed_xml(path, include_path=False):
return dict_out


def _first_child_text(element, path):
    """Return the text of the first child of `element` matching `path`, or ''."""
    node = element.find(path)
    if node is None:
        return ''
    return node.text or ''


def parse_references(tree):
    """
    Given a parsed XML tree, parse its reference list
    to list of dictionary

    Each dictionary has the keys ``ref_id``, ``name``, ``article_title``,
    ``journal``, ``journal_type``, ``pmid``, ``pmc`` and ``pmid_cited``.
    ``pmid`` and ``pmc`` are taken from ``parse_article_meta(tree)``;
    the remaining fields come from the ``mixed-citation`` child of each
    ``//ref-list/ref[@id]`` element.

    Returns None (not an empty list) when no reference was collected.
    """
    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta['pmid']
    pmc = dict_article_meta['pmc']

    references = tree.xpath('//ref-list/ref[@id]')
    dict_refs = list()
    for reference in references:
        ref_id = reference.attrib['id']
        ref = reference.find('mixed-citation')
        # The None test must come before any access to ref.attrib: a <ref>
        # without a mixed-citation child would otherwise raise AttributeError.
        # NOTE(review): the nesting of the original was not visible; this
        # assumes all fields below are only collected for citations that
        # carry a publication-type attribute -- confirm.
        if ref is None or 'publication-type' not in ref.attrib:
            continue
        # Read the attribute by name; taking the first attribute value would
        # pick up an unrelated attribute whenever publication-type is not
        # the first one on the element.
        journal_type = ref.attrib['publication-type']

        names = list()
        if ref.find('name') is not None:
            for n in ref.findall('name'):
                # Child texts are joined in reverse document order;
                # children without text are skipped so join() cannot fail.
                parts = [t.text for t in n if t.text]
                if parts:
                    names.append(' '.join(parts[::-1]))
        elif ref.find('person-group') is not None:
            for n in ref.find('person-group'):
                name = ' '.join(n.xpath('given-names/text()') +
                                n.xpath('surname/text()'))
                # Children with neither given-names nor surname text yield
                # an empty string; leave them out of the author list.
                if name:
                    names.append(name)

        article_title = _first_child_text(ref, 'article-title').replace('\n', ' ')
        journal = _first_child_text(ref, 'source')
        pmid_cited = _first_child_text(ref, 'pub-id[@pub-id-type="pmid"]')

        dict_ref = {'ref_id': ref_id,
                    'name': '; '.join(names),
                    'article_title': article_title,
                    'journal': journal,
                    'journal_type': journal_type,
                    'pmid': pmid,
                    'pmc': pmc,
                    'pmid_cited': pmid_cited}
        dict_refs.append(dict_ref)
    if len(dict_refs) == 0:
        dict_refs = None
    return dict_refs


def parse_pubmed_references(path):
    """
    Given path to xml file, parse references articles
    to list of dictionary

    Reads the file with ``read_xml`` and returns the result of
    ``parse_references`` on the resulting tree (a list of dictionaries,
    or None when no reference was collected).
    """
    tree = read_xml(path)
    return parse_references(tree)


def parse_pubmed_references(path):
    """
    Read the xml file at the given path and return
    whatever parse_references yields for its tree
    """
    return parse_references(read_xml(path))


def parse_paragraph(tree, dict_refs):
def parse_pubmed_paragraph(path):
"""
Give tree and reference dictionary
return dictionary of referenced paragraph, section that it belongs to,
and its cited PMID
"""
tree = read_xml(path)
dict_refs = parse_pubmed_references(path)
dict_article_meta = parse_article_meta(tree)
pmid = dict_article_meta['pmid']
pmc = dict_article_meta['pmc']
Expand Down Expand Up @@ -281,24 +277,12 @@ def parse_paragraph(tree, dict_refs):
return dict_pars


def parse_pubmed_paragraph(path):
    """
    Read the xml file at the given path, parse its references,
    and return the result of parse_paragraph for that tree
    and those references.
    """
    xml_tree = read_xml(path)
    # References are parsed first; parse_paragraph takes them as its second argument.
    return parse_paragraph(xml_tree, parse_references(xml_tree))


def parse_pubmed_caption(path):
"""
Given single xml path, extract figure caption and
reference id back to that figure
"""
tree = read_xml(path)

dict_article_meta = parse_article_meta(tree)
pmid = dict_article_meta['pmid']
pmc = dict_article_meta['pmc']
Expand Down

0 comments on commit 8dd7fe0

Please sign in to comment.