Skip to content

Commit

Permalink
Add parsing updates
Browse files Browse the repository at this point in the history
Deals with several edge-cases to create a more human-readable output
that matches the input more as expected by the reader

resolves: #16
  • Loading branch information
creisle committed Mar 8, 2023
1 parent 19cce75 commit 8c5991c
Show file tree
Hide file tree
Showing 4 changed files with 334 additions and 57 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ snakejob.*

listings/
dist
debug
50 changes: 50 additions & 0 deletions src/bioconverters/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
GREEK_ALPHABET = {
'\u0391': 'Alpha',
'\u0392': 'Beta',
'\u0393': 'Gamma',
'\u0394': 'Delta',
'\u0395': 'Epsilon',
'\u0396': 'Zeta',
'\u0397': 'Eta',
'\u0398': 'Theta',
'\u0399': 'Iota',
'\u039A': 'Kappa',
'\u039B': 'Lambda',
'\u039C': 'Mu',
'\u039D': 'Nu',
'\u039E': 'Xi',
'\u039F': 'Omicron',
'\u03A0': 'Pi',
'\u03A1': 'Rho',
'\u03A3': 'Sigma',
'\u03A4': 'Tau',
'\u03A5': 'Upsilon',
'\u03A6': 'Phi',
'\u03A7': 'Chi',
'\u03A8': 'Psi',
'\u03A9': 'Omega',
'\u03B1': 'alpha',
'\u03B2': 'beta',
'\u03B3': 'gamma',
'\u03B4': 'delta',
'\u03B5': 'epsilon',
'\u03B6': 'zeta',
'\u03B7': 'eta',
'\u03B8': 'theta',
'\u03B9': 'iota',
'\u03BA': 'kappa',
'\u03BB': 'lambda',
'\u03BC': 'mu',
'\u03BD': 'nu',
'\u03BE': 'xi',
'\u03BF': 'omicron',
'\u03C0': 'pi',
'\u03C1': 'rho',
'\u03C3': 'sigma',
'\u03C4': 'tau',
'\u03C5': 'upsilon',
'\u03C6': 'phi',
'\u03C7': 'chi',
'\u03C8': 'psi',
'\u03C9': 'omega',
}
145 changes: 101 additions & 44 deletions src/bioconverters/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
import unicodedata
import uuid
import xml.etree.cElementTree as etree
from copy import copy
from typing import Callable, Dict, Iterable, List, Optional, Tuple

import bioc
from unidecode import unidecode

from .constants import GREEK_ALPHABET

# XML elements to ignore the contents of
IGNORE_LIST = [
Expand All @@ -19,7 +23,7 @@
"tex-math",
"mml:math",
"object-id",
"ext-link",
"ext-link", # TODO: should we keep URL content? some of these have text rather than the URL as inner content
]

# XML elements to separate text between (into different passages)
Expand All @@ -34,15 +38,18 @@
"label",
]


TABLE_DELIMITER = '\t'
TABLE_DELIMITED_TAGS = {'tr', 'th', 'td'}
# Tags that should be pre-pended with a space on merge
PSEUDO_SPACE_TAGS = {'sup', 'break'}
ANNOTATION_MARKER_PATTERN = r'ANN_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'


class TextChunk:
text: str
xml_node: str
xml_path: str
non_separating: bool = False
is_tail: bool = False
is_annotation: bool = False

Expand All @@ -51,14 +58,12 @@ def __init__(
text,
xml_node,
xml_path=None,
non_separating=False,
is_tail=False,
is_annotation=False,
):
self.text = text
self.xml_node = xml_node
self.xml_path = xml_path
self.non_separating = non_separating or is_annotation
self.is_tail = is_tail
self.is_annotation = is_annotation

Expand All @@ -72,10 +77,10 @@ def __repr__(self):
tag = self.tag
if self.is_tail:
tag = f'{tag}#'
ns = '-ns' if self.non_separating else ''
tag = f'{tag}{ns}'
if self.text:
tag = f'{tag}+text[{len(self.text)}]'
if self.is_annotation:
tag = f'{tag}@'
return tag

@property
Expand All @@ -92,9 +97,9 @@ def remove_brackets_without_words(text: str) -> str:
changed = True
previous_text = text
while changed:
fixed = re.sub(r"\([^\w\t]*\)", "", previous_text)
fixed = re.sub(r"\[[^\w\t]*\]", "", fixed)
fixed = re.sub(r"\{[^\w\t]*\}", "", fixed)
fixed = re.sub(r"\([^\w\t-]*\)", "", previous_text)
fixed = re.sub(r"\[[^\w\t-]*\]", "", fixed)
fixed = re.sub(r"\{[^\w\t-]*\}", "", fixed)
changed = bool(previous_text != fixed)
previous_text = fixed
return fixed
Expand All @@ -115,22 +120,30 @@ def cleanup_text(text: str) -> str:
"""
# Remove some "control-like" characters (left/right separator)
text = text.replace(u"\u2028", " ").replace(u"\u2029", " ")
text = text.replace('°', ' ° ')
text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C" or ch == TABLE_DELIMITER)
text = "".join(ch if unicodedata.category(ch)[0] != "Z" else " " for ch in text)

# replace greek letters with their long-form equivalent
for greek_letter, replacement in GREEK_ALPHABET.items():
text = text.replace(greek_letter, replacement)

text = unidecode(text, errors='preserve')

# Remove repeated commands and commas next to periods
text = re.sub(r",([^\S\t]*,)*", ",", text)
text = re.sub(r"(,[^\S\t]*)*\.", ".", text)
text = remove_brackets_without_words(text)

# remove extra spaces from in-text figute/table citations
# remove extra spaces from in-text figure/table citations
text = re.sub(r'\([^\S\t]*([^)]*[^\s)])[^\S\t]*\)', r'(\1)', text)

# remove trailing spaces before periods
text = re.sub(r'[^\S\t]+\.(\s|$)', r'.\1', text)

# remove extra spaces around commas/semi-colons
text = re.sub(r'[^\S\t]*([,;])[^\S\t]+', r'\1 ', text)
text = re.sub(r'[^\S\t]*([,;:])([^\S\t]+)', r'\1 ', text)
text = re.sub(r'[^\S\t]*([,;:])$', r'\1', text)

# trim leading and trailing non tab whitespace
text = re.sub(r'(^|\t)([^\S\t]+)', r'\1', text)
Expand Down Expand Up @@ -168,17 +181,14 @@ def merge_adjacent_xref_siblings(elem_list):
If two XML elements in a list are adjacent and both xrefs separated only by punctuation, merge them
"""
siblings = []

for elem in elem_list:
if siblings and elem.tag == 'xref' and siblings[-1].tag == 'xref':
# merge these 2 if the tail of the first element is a punctuation mark
prev_tail = (siblings[-1].tail or '').strip()
if (
siblings[-1].tail
and len(prev_tail) == 1
and unicodedata.category(prev_tail)[0] == 'P'
and elem.attrib.get('ref-type') == siblings[-1].attrib.get('ref-type')
):
not prev_tail
or (len(prev_tail) == 1 and unicodedata.category(prev_tail[0])[0] == 'P')
) and elem.attrib.get('ref-type') == siblings[-1].attrib.get('ref-type'):

siblings[-1].text = (siblings[-1].text or '') + prev_tail + (elem.text or '')
siblings[-1].tail = elem.tail
Expand All @@ -189,7 +199,7 @@ def merge_adjacent_xref_siblings(elem_list):

def get_tag_path(mapping: Dict[etree.Element, etree.Element], node: etree.Element) -> str:
"""
Get a string representing the path of the currentl XML node in the heirachry of the XML file
Get a string representing the path of the current XML node in the hierarchy of the XML file
"""
path = []
current_node = node
Expand Down Expand Up @@ -217,8 +227,8 @@ def tag_handler(
except NotImplementedError:
pass
# Extract any raw text directly in XML element or just after
head = (elem.text or "").strip()
tail = (elem.tail or "").strip()
head = elem.text or ""
tail = elem.tail or ""

# Then get the text from all child XML nodes recursively
child_passages = []
Expand Down Expand Up @@ -246,16 +256,14 @@ def tag_handler(
):
# Check if the tag should be ignored (so don't use main contents)
return [
TextChunk(tail, elem, non_separating=True, is_tail=True),
TextChunk(tail, elem, is_tail=True),
]

return [TextChunk(head, elem)] + child_passages + [TextChunk(tail, elem, is_tail=True)]


def strip_annotation_markers(
text: str,
annotations_map: Dict[str, str],
marker_pattern=r'ANN_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',
text: str, annotations_map: Dict[str, str], marker_pattern=ANNOTATION_MARKER_PATTERN
) -> Tuple[str, List[bioc.BioCAnnotation]]:
"""
Given a set of annotations, remove any which are found in the current text and return
Expand All @@ -271,7 +279,9 @@ def strip_annotation_markers(
transformed_text = text

pattern = (
r'([^\S\t]*)([\(\[\{][^\S\t]*)?(' + marker_pattern + r')([^\S\t]*[\)\]\}])?([^\S\t]*)(\.)?'
r'([^\S\t]*)([\(\[\{][^\S\t]*)?('
+ marker_pattern
+ r')([^\S\t]*[\)\]\}])?([^\S\t]*)([\.,;:])?'
)

matched_annotations: List[Tuple[int, int, str]] = []
Expand All @@ -286,13 +296,17 @@ def strip_annotation_markers(
end_offset = 0

matched_brackets = (
br_open and br_close and br_open.strip() + br_close.strip() in {'\{\}', '[]', '()'}
br_open and br_close and br_open.strip() + br_close.strip() in {r'{}', '[]', '()'}
)

if not matched_brackets and (br_open or br_close):
# do not include in the sequence to be removed from the text
start_offset += len(ws_start or '') + len(br_open or '')
end_offset += len(period or '') + len(ws_end or '') + len(br_close or '')
if br_open:
start_offset += len(ws_start or '') + len(br_open or '')
# end_offset += len(period or '') + len(ws_end or '') + len(br_close or '')
else:
# start_offset += len(ws_start or '') + len(br_open or '')
end_offset += len(period or '') + len(ws_end or '') + len(br_close or '')
elif not period:
if ws_end:
end_offset += len(ws_end)
Expand Down Expand Up @@ -327,7 +341,41 @@ def strip_annotation_markers(
return transformed_text, transformed_annotations


def merge_text_chunks(chunk_list, annotations_map=None) -> TextChunk:
def remove_style_tags(chunk_list_in: List[TextChunk], style_tags=['italic', 'bold', 'emph']):
"""
Given some list of text chunks, simplify the list to remove consecutive style-only tags
"""
if len(chunk_list_in) < 4:
return chunk_list_in

start_index = 1
chunk_list = chunk_list_in[:]

while start_index < len(chunk_list) - 2:
current_tag = chunk_list[start_index]

if current_tag.tag not in style_tags or current_tag.is_tail:
start_index += 1
continue

closing_tag = chunk_list[start_index + 1]

if closing_tag.tag != current_tag.tag or not closing_tag.is_tail:
start_index += 1
continue

chunk_list[start_index - 1] = copy(chunk_list[start_index - 1])
chunk_list[start_index - 1].text = (
chunk_list[start_index - 1].text + current_tag.text + closing_tag.text
)
chunk_list = chunk_list[:start_index] + chunk_list[start_index + 2 :]

length_diff = sum([len(c.text) for c in chunk_list_in]) - sum([len(c.text) for c in chunk_list])
assert length_diff == 0, f'characters changed {length_diff}'
return chunk_list


def merge_text_chunks(chunk_list: List[TextChunk], annotations_map=None) -> TextChunk:
"""
Merge some list of text chunks and pick the most top-level xml node associated with the list to be the new node for the chunk
Expand All @@ -336,40 +384,48 @@ def merge_text_chunks(chunk_list, annotations_map=None) -> TextChunk:
if annotations_map is None:
# if no mapping is expected, simply drop annotation chunks
chunk_list = [c for c in chunk_list if not c.is_annotation]

chunk_list = remove_style_tags(chunk_list)
merge = []

for i, current_chunk in enumerate(chunk_list):
if i > 0:
previous_chunk = chunk_list[i - 1]
join_char = ' '
join_char = ''
tags = {previous_chunk.tag, current_chunk.tag}
if any(
[
previous_chunk.is_annotation,
current_chunk.is_annotation,
previous_chunk.non_separating,
current_chunk.non_separating,
current_chunk.is_tail and not (current_chunk.text or previous_chunk.text),
]
if current_chunk.tag == 'sup':
if not current_chunk.is_tail:
if re.match(r'^\s*(−|-)?\d+\s*$', current_chunk.text):
if (
previous_chunk.text
and unicodedata.category(previous_chunk.text[-1])[0] != 'P'
):
join_char = '^'
elif (
current_chunk.text
and previous_chunk.text
and unicodedata.category(current_chunk.text[0])[0]
== unicodedata.category(previous_chunk.text[-1])[0]
):
join_char = '-'
elif current_chunk.tag in PSEUDO_SPACE_TAGS or (
current_chunk.tag == 'xref' and not current_chunk.is_annotation
):
join_char = ''
join_char = ' '
elif len(tags) == 1 and tags & TABLE_DELIMITED_TAGS and not current_chunk.is_tail:
join_char = TABLE_DELIMITER

merge.append(join_char)

current_text = cleanup_text(current_chunk.text)
if current_chunk.is_annotation:
ann_id = f'ANN_{uuid.uuid4()}'
annotations_map[ann_id] = current_text
annotations_map[ann_id] = current_chunk.text
merge.append(ann_id)
else:
merge.append(current_text)
merge.append(current_chunk.text)

text = ''.join(merge)
# Remove any newlines (as they can be trusted to be syntactically important)
text = text.replace('\n', '')
text = text.replace('\n', ' ')
text = cleanup_text(text)

first_non_tail_node = chunk_list[0].xml_node
Expand Down Expand Up @@ -399,6 +455,7 @@ def extract_text_chunks(
"""
if not isinstance(element_list, list):
element_list = [element_list]

raw_text_chunks = []
for elem in element_list:
raw_text_chunks.extend(tag_handler(elem, tag_handlers))
Expand Down
Loading

0 comments on commit 8c5991c

Please sign in to comment.