Skip to content

Commit

Permalink
Add multi-level table header handling
Browse files Browse the repository at this point in the history
resolves: #14
  • Loading branch information
creisle committed Mar 8, 2023
1 parent c734a33 commit 5350c7d
Show file tree
Hide file tree
Showing 4 changed files with 293 additions and 0 deletions.
90 changes: 90 additions & 0 deletions src/bioconverters/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,94 @@ def get_tag_path(mapping: Dict[etree.Element, etree.Element], node: etree.Elemen
return '/'.join((path[::-1]))


def first_empty_index(items) -> int:
    """
    Locate the first falsy entry of an iterable

    Args:
        items: the iterable to scan; it is consumed only up to the first falsy entry

    Returns:
        the position of the first falsy item, or 0 when every item is truthy (including when
        there are no items at all)
    """
    falsy_positions = (position for position, value in enumerate(items) if not value)
    return next(falsy_positions, 0)


def get_unique_child_element_index(elem: etree.Element, child_elem_type: str) -> int:
    """
    Find the position of the one direct child of an XML node that carries the given tag

    Args:
        elem: the element to search children of
        child_elem_type: the tag type of the element in question

    Returns:
        the index of the matching child within elem

    Raises:
        KeyError: no direct child has the requested tag
        ValueError: more than one direct child has the requested tag
    """
    matches = [position for position, child in enumerate(elem) if child.tag == child_elem_type]
    if len(matches) > 1:
        raise ValueError(f'found multiple child elements with tag type = {child_elem_type}')
    if len(matches) == 0:
        raise KeyError(f'unable to find child element with tag type = {child_elem_type}')
    (only_match,) = matches
    return only_match


def normalize_table(elem: etree.Element) -> etree.Element:
    """
    Replace any multi-row table header with a single-row header by repeating col-spanning labels as prefixes on their sub-columns

    The header cells are laid out on a rows x columns grid (honouring rowspan/colspan), every
    column is then collapsed top-to-bottom into one label, and the original thead is swapped for
    a new thead holding a single row of plain th cells.

    Args:
        elem: the table element; it must have exactly one direct thead child. It is modified in place

    Returns:
        the same table element that was passed in

    Raises:
        KeyError: the table has no direct thead child
        ValueError: the table has more than one direct thead child
    """
    header_elem_index = get_unique_child_element_index(elem, 'thead')
    header = elem[header_elem_index]

    header_rows = len(header)
    if not header_rows:
        # an empty thead has nothing to flatten; leave the table untouched
        return elem

    # the grid width is taken from the first header row (sum of its colspans)
    header_cols = sum(int(cell.attrib.get('colspan', 1)) for cell in header[0])

    header_matrix = [[''] * header_cols for _ in range(header_rows)]
    filled_cells = [[0] * header_cols for _ in range(header_rows)]

    for i_row, row in enumerate(header):
        for header_cell in row:
            occupied = filled_cells[i_row]
            if all(occupied):
                # malformed header: this row holds more cells than the grid has columns.
                # Skip the extra cell rather than letting it overwrite the first column
                continue

            text = str(merge_text_chunks(chunk for chunk in tag_handler(header_cell)))
            first_col = first_empty_index(occupied)
            # clamp the spans to the grid so an oversized rowspan/colspan cannot index past it
            end_row = min(i_row + int(header_cell.attrib.get('rowspan', 1)), header_rows)
            end_col = min(first_col + int(header_cell.attrib.get('colspan', 1)), header_cols)

            for r in range(i_row, end_row):
                for c in range(first_col, end_col):
                    header_matrix[r][c] = text
                    filled_cells[r][c] = 1

    # a cell spanning several rows was copied into each of them; working bottom-up, blank every
    # copy that repeats the cell directly above so the label appears once per column
    for col in range(header_cols):
        for i_row in range(header_rows - 1, 0, -1):
            if header_matrix[i_row][col] == header_matrix[i_row - 1][col]:
                header_matrix[i_row][col] = ''

    # now flatten the header rows: join the remaining labels of each column top-to-bottom
    labels = []
    for col in range(header_cols):
        joined = ' '.join(
            header_matrix[i_row][col] for i_row in range(header_rows) if header_matrix[i_row][col]
        )
        labels.append(re.sub(r'[\s\n]+', ' ', joined.strip()))

    # build the replacement header as elements instead of parsing an interpolated XML string, so
    # labels containing markup characters such as '&' or '<' are kept as literal text
    new_header_elem = etree.Element('thead')
    new_row = etree.SubElement(new_header_elem, 'tr')
    for label in labels:
        new_cell = etree.SubElement(new_row, 'th')
        if label:
            new_cell.text = label

    elem[header_elem_index] = new_header_elem
    return elem


def tag_handler(
elem: etree.Element, custom_handlers: Dict[str, TagHandlerFunction] = {}
) -> List[TextChunk]:
Expand All @@ -226,6 +314,8 @@ def tag_handler(
return custom_handlers[elem.tag](elem, custom_handlers=custom_handlers)
except NotImplementedError:
pass
if elem.tag == 'table':
elem = normalize_table(elem)
# Extract any raw text directly in XML element or just after
head = elem.text or ""
tail = elem.tail or ""
Expand Down
42 changes: 42 additions & 0 deletions tests/data/colspans_table.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.1" encoding="utf8" ?>
<!-- Test fixture: table with a single-row header and full-width (colspan="2") body rows; source: PMC7461630 -->
<table>
<thead>
<tr>
<th align="left" valign="top" rowspan="1" colspan="1"/>
<th align="left" valign="top" rowspan="1" colspan="1">All patients in <italic>NTRK</italic> gene fusion-positive efficacy-evaluable population (n=54)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" rowspan="1" colspan="1">Age, years</td>
<td align="left" valign="top" rowspan="1" colspan="1">58 (48–67)</td>
</tr>
<tr>
<td colspan="2" align="left" valign="top" rowspan="1">Sex</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="1" colspan="1"> Female</td>
<td align="left" valign="top" rowspan="1" colspan="1">32 (59%)</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="1" colspan="1"> Male</td>
<td align="left" valign="top" rowspan="1" colspan="1">22 (41%)</td>
</tr>
<tr>
<td colspan="2" align="left" valign="top" rowspan="1">Race</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="1" colspan="1"> White</td>
<td align="left" valign="top" rowspan="1" colspan="1">43 (80%)</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="1" colspan="1"> Asian</td>
<td align="left" valign="top" rowspan="1" colspan="1">7 (13%)</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="1" colspan="1"> Other</td>
<td align="left" valign="top" rowspan="1" colspan="1">4 (7%)</td>
</tr>
</tbody>
</table>
150 changes: 150 additions & 0 deletions tests/data/multi-level-table-header.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
<?xml version="1.1" encoding="utf8" ?>
<!-- Test fixture: table whose header spans three rows using rowspan/colspan; source: PMC2873663 -->
<article>
<table frame="box" rules="all">
<thead>
<tr>
<th rowspan="3" align="center" valign="top" colspan="1">p53<break/>
MUTATION</th>
<th rowspan="3" align="center" valign="top" colspan="1">FUNCTIONAL<xref ref-type="table-fn" rid="TFN1">a</xref>
<break/>
STATUS</th>
<th colspan="3" align="center" valign="top" rowspan="1">IARC DATABASE<xref ref-type="table-fn" rid="TFN2">b</xref>
</th>
<th rowspan="3" align="center" valign="top" colspan="1">FEATURES<xref ref-type="table-fn" rid="TFN3">c</xref>
</th>
</tr>
<tr>
<th colspan="2" align="center" valign="top" rowspan="1">SOMATIC</th>
<th rowspan="2" align="center" valign="top" colspan="1">GERMLINE<break/>FAMILIES</th>
</tr>
<tr>
<th align="center" valign="top" rowspan="1" colspan="1">TOTAL</th>
<th align="center" valign="top" rowspan="1" colspan="1">BREAST</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">T125R</td>
<td align="center" valign="top" rowspan="1" colspan="1">ALTERED</td>
<td align="center" valign="top" rowspan="1" colspan="1">2</td>
<td align="center" valign="top" rowspan="1" colspan="1">1</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1"/>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">L130V</td>
<td align="center" valign="top" rowspan="1" colspan="1">ALTERED</td>
<td align="center" valign="top" rowspan="1" colspan="1">21</td>
<td align="center" valign="top" rowspan="1" colspan="1">3</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1">Neo.</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">C135F</td>
<td align="center" valign="top" rowspan="1" colspan="1">LOSS</td>
<td align="center" valign="top" rowspan="1" colspan="1">49</td>
<td align="center" valign="top" rowspan="1" colspan="1">3</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1"/>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">C135Y</td>
<td align="center" valign="top" rowspan="1" colspan="1">LOSS</td>
<td align="center" valign="top" rowspan="1" colspan="1">70</td>
<td align="center" valign="top" rowspan="1" colspan="1">11</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1">Neo.</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">A138V</td>
<td align="center" valign="top" rowspan="1" colspan="1">FUNCTIONAL</td>
<td align="center" valign="top" rowspan="1" colspan="1">48</td>
<td align="center" valign="top" rowspan="1" colspan="1">7</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1"/>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">C176F</td>
<td align="center" valign="top" rowspan="1" colspan="1">LOSS</td>
<td align="center" valign="top" rowspan="1" colspan="1">181</td>
<td align="center" valign="top" rowspan="1" colspan="1">7</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1">L2-Zn</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">H179R</td>
<td align="center" valign="top" rowspan="1" colspan="1">LOSS</td>
<td align="center" valign="top" rowspan="1" colspan="1">139</td>
<td align="center" valign="top" rowspan="1" colspan="1">16</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1">L2-Zn; Neo.</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">R181P</td>
<td align="center" valign="top" rowspan="1" colspan="1">LOSS</td>
<td align="center" valign="top" rowspan="1" colspan="1">22</td>
<td align="center" valign="top" rowspan="1" colspan="1">2</td>
<td align="center" valign="top" rowspan="1" colspan="1">1</td>
<td align="center" valign="top" rowspan="1" colspan="1">L2; FH</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">S183L</td>
<td align="center" valign="top" rowspan="1" colspan="1">LOSS</td>
<td align="center" valign="top" rowspan="1" colspan="1">3</td>
<td align="center" valign="top" rowspan="1" colspan="1">1</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1">L2</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">P190L</td>
<td align="center" valign="top" rowspan="1" colspan="1">ALTERED</td>
<td align="center" valign="top" rowspan="1" colspan="1">48</td>
<td align="center" valign="top" rowspan="1" colspan="1">4</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1">L2; BRCA1</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">L194P</td>
<td align="center" valign="top" rowspan="1" colspan="1">ALTERED</td>
<td align="center" valign="top" rowspan="1" colspan="1">14</td>
<td align="center" valign="top" rowspan="1" colspan="1">1</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1">L2; BRCA</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">L194R</td>
<td align="center" valign="top" rowspan="1" colspan="1">LOSS</td>
<td align="center" valign="top" rowspan="1" colspan="1">55</td>
<td align="center" valign="top" rowspan="1" colspan="1">9</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1">L2</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">H214R</td>
<td align="center" valign="top" rowspan="1" colspan="1">ALTERED</td>
<td align="center" valign="top" rowspan="1" colspan="1">72</td>
<td align="center" valign="top" rowspan="1" colspan="1">5</td>
<td align="center" valign="top" rowspan="1" colspan="1">0</td>
<td align="center" valign="top" rowspan="1" colspan="1">BRCA2</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">Y220C</td>
<td align="center" valign="top" rowspan="1" colspan="1">ALTERED</td>
<td align="center" valign="top" rowspan="1" colspan="1">315</td>
<td align="center" valign="top" rowspan="1" colspan="1">41</td>
<td align="center" valign="top" rowspan="1" colspan="1">4</td>
<td align="center" valign="top" rowspan="1" colspan="1">LFS; Neo.</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="1" colspan="1">G245S</td>
<td align="center" valign="top" rowspan="1" colspan="1">LOSS</td>
<td align="center" valign="top" rowspan="1" colspan="1">396</td>
<td align="center" valign="top" rowspan="1" colspan="1">35</td>
<td align="center" valign="top" rowspan="1" colspan="1">18</td>
<td align="center" valign="top" rowspan="1" colspan="1">L3; LFS, LFL, FH; Neo.;<break/>
BRCA1</td>
</tr>
</tbody>
</table>
</article>
11 changes: 11 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,17 @@ def test_floating_table():
assert len(table_body[0].split(TABLE_DELIMITER)) == expected_columns * expected_rows


def test_multilevel_table_header():
    """
    The three-row header of the multi-level fixture should come out as a single thead chunk with
    one tab-separated label per column, each prefixed by the labels of the cells spanning it
    """
    expected_header = 'p53 MUTATION\tFUNCTIONAL a STATUS\tIARC DATABASE b SOMATIC TOTAL\tIARC DATABASE b SOMATIC BREAST\tIARC DATABASE b GERMLINE FAMILIES\tFEATURES c'

    with open(data_file_path('multi-level-table-header.xml'), 'r') as xml_file:
        root = etree.fromstring(xml_file.read())

    header_texts = []
    for chunk in extract_text_chunks([root]):
        if chunk.xml_path.endswith('thead'):
            header_texts.append(chunk.text)

    assert header_texts == [expected_header]


@pytest.mark.parametrize(
'input,output',
[
Expand Down

0 comments on commit 5350c7d

Please sign in to comment.