Skip to content

Commit

Permalink
Separate passages with newline
Browse files (browse the repository at this point in the history)
Relates to: #16
  • Loading branch information
creisle committed Mar 15, 2023
1 parent 4c07ab0 commit 6df1e2d
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 5 deletions.
6 changes: 6 additions & 0 deletions src/bioconverters/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,4 +587,10 @@ def extract_text_chunks(
for chunk in merged_chunks:
chunk.xml_path = get_tag_path(mapping, chunk.xml_node)

+ result = []
+ for chunk in merged_chunks:
+ if chunk.text:
+ if result and result[-1].text[-1] != '\n':
+ result[-1].text = result[-1].text + '\n'
+ result.append(chunk)
return [c for c in merged_chunks if c.text]
4 changes: 2 additions & 2 deletions tests/test_pmcxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def citation_offset_article():
def test_convert_pmc_with_table(table_article):
file = StringIO(table_article)
table_header = (
- 'ERBB2 mutation\tExon\tFunctional region\tCancer type\tLapatinib\tAEE788\tReference'
+ 'ERBB2 mutation\tExon\tFunctional region\tCancer type\tLapatinib\tAEE788\tReference\n'
)
- expected_content = "WT\tNA\tNA\tBreast cancer\t30\t257\tNA\tL755S\t19\tATP binding region\tBreast and gastric cancer\t>2000\t897\t4\tL755P 19\tATP binding region\tNSCLC\t1545\t1216\t2,3\tV773A\t20\tATP binding region\tSCCHN\t146\t200\t6\tV777L\t20\tATP binding region\tGastric, colon and lung\t27\t215\t3,4\tT798M\t20\tGate keeper residue\tNA\t1433\t>2000\tNA\tN857S\t21\tActivation loop\tOvarian cancer\t75\t246\t2\tT862A\t21\tActivation loop\tPrimary gastric cancer\t125\t191\t7\tH878Y\t21\tActivation loop\tHepatocellular carcinoma\t14\t168\t5"
+ expected_content = "WT\tNA\tNA\tBreast cancer\t30\t257\tNA\tL755S\t19\tATP binding region\tBreast and gastric cancer\t>2000\t897\t4\tL755P 19\tATP binding region\tNSCLC\t1545\t1216\t2,3\tV773A\t20\tATP binding region\tSCCHN\t146\t200\t6\tV777L\t20\tATP binding region\tGastric, colon and lung\t27\t215\t3,4\tT798M\t20\tGate keeper residue\tNA\t1433\t>2000\tNA\tN857S\t21\tActivation loop\tOvarian cancer\t75\t246\t2\tT862A\t21\tActivation loop\tPrimary gastric cancer\t125\t191\t7\tH878Y\t21\tActivation loop\tHepatocellular carcinoma\t14\t168\t5\n"
all_passages = []
for doc in docs2bioc(file, 'pmcxml', trim_sentences=False, all_xml_path_infon=True):
all_passages.extend(doc.passages)
Expand Down
12 changes: 9 additions & 3 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ def test_extract_text_chunks_sibling_xrefs():
'The 2-year invasive disease-free survival rate was 93·9%',
'The 2-year invasive disease-free survival rate was 93.9%',
],
+ [
+ '<sec><title>Title of a thing</title><p>paragraph content</p></sec>',
+ 'Title of a thing\nparagraph content',
+ ],
[
'Compared with <italic>KRAS</italic> wild type and empty vector controls, <italic>KRAS</italic> <sup>10</sup>G<sup>11</sup> and <sup>11</sup>GA<sup>12</sup> significantly enhanced in vivo tumor growth',
'Compared with KRAS wild type and empty vector controls, KRAS 10G11 and 11GA12 significantly enhanced in vivo tumor growth',
Expand Down Expand Up @@ -241,7 +245,7 @@ def test_extract_figure_label():
assert not annotations_map
xml_paths = [c.xml_path for c in chunks]
assert 'article/fig/label' in xml_paths
- assert 'Figure 3' in [c.text for c in chunks if c.xml_path == 'article/fig/label']
+ assert 'Figure 3\n' in [c.text for c in chunks if c.xml_path == 'article/fig/label']


@pytest.mark.parametrize(
Expand Down Expand Up @@ -404,7 +408,7 @@ def test_floating_table():

assert len(table_header) == 1
header = table_header[0].split(TABLE_DELIMITER)
- assert header == ['Patient sample', 'Exon', 'DNA', 'Protein', 'Domain', 'Germline/ Somatic']
+ assert header == ['Patient sample', 'Exon', 'DNA', 'Protein', 'Domain', 'Germline/ Somatic\n']

table_body = [c.text for c in chunks if c.xml_path.endswith('tbody')]
assert len(table_body) == 1
Expand All @@ -418,8 +422,10 @@ def test_multilevel_table_header():
chunks = extract_text_chunks([etree.fromstring(xml_data)])
table_header = [c.text for c in chunks if c.xml_path.endswith('thead')]
assert table_header == [
- 'p53 MUTATION\tFUNCTIONAL a STATUS\tIARC DATABASE b SOMATIC TOTAL\tIARC DATABASE b SOMATIC BREAST\tIARC DATABASE b GERMLINE FAMILIES\tFEATURES c'
+ 'p53 MUTATION\tFUNCTIONAL a STATUS\tIARC DATABASE b SOMATIC TOTAL\tIARC DATABASE b SOMATIC BREAST\tIARC DATABASE b GERMLINE FAMILIES\tFEATURES c\n'
]
+ table_body = [c.text for c in chunks if c.xml_path.endswith('tbody')]
+ assert 'L130V\tALTERED\t' in table_body[0]


@pytest.mark.parametrize(
Expand Down

0 comments on commit 6df1e2d

Please sign in to comment.