Add parsing updates

Deals with several edge-cases to create a more human-readable output that matches the input more as expected by the reader resolves: #16
jakelever · Mar 8, 2023 · 8c5991c · 8c5991c
1 parent 19cce75
commit 8c5991c
Show file tree

Hide file tree

Showing 4 changed files with 334 additions and 57 deletions.
diff --git a/.gitignore b/.gitignore
@@ -33,3 +33,4 @@ snakejob.*
 
 listings/
 dist
+debug
diff --git a/src/bioconverters/constants.py b/src/bioconverters/constants.py
@@ -0,0 +1,50 @@
+GREEK_ALPHABET = {
+    '\u0391': 'Alpha',
+    '\u0392': 'Beta',
+    '\u0393': 'Gamma',
+    '\u0394': 'Delta',
+    '\u0395': 'Epsilon',
+    '\u0396': 'Zeta',
+    '\u0397': 'Eta',
+    '\u0398': 'Theta',
+    '\u0399': 'Iota',
+    '\u039A': 'Kappa',
+    '\u039B': 'Lambda',
+    '\u039C': 'Mu',
+    '\u039D': 'Nu',
+    '\u039E': 'Xi',
+    '\u039F': 'Omicron',
+    '\u03A0': 'Pi',
+    '\u03A1': 'Rho',
+    '\u03A3': 'Sigma',
+    '\u03A4': 'Tau',
+    '\u03A5': 'Upsilon',
+    '\u03A6': 'Phi',
+    '\u03A7': 'Chi',
+    '\u03A8': 'Psi',
+    '\u03A9': 'Omega',
+    '\u03B1': 'alpha',
+    '\u03B2': 'beta',
+    '\u03B3': 'gamma',
+    '\u03B4': 'delta',
+    '\u03B5': 'epsilon',
+    '\u03B6': 'zeta',
+    '\u03B7': 'eta',
+    '\u03B8': 'theta',
+    '\u03B9': 'iota',
+    '\u03BA': 'kappa',
+    '\u03BB': 'lambda',
+    '\u03BC': 'mu',
+    '\u03BD': 'nu',
+    '\u03BE': 'xi',
+    '\u03BF': 'omicron',
+    '\u03C0': 'pi',
+    '\u03C1': 'rho',
+    '\u03C3': 'sigma',
+    '\u03C4': 'tau',
+    '\u03C5': 'upsilon',
+    '\u03C6': 'phi',
+    '\u03C7': 'chi',
+    '\u03C8': 'psi',
+    '\u03C9': 'omega',
+}
diff --git a/src/bioconverters/utils.py b/src/bioconverters/utils.py
@@ -2,9 +2,13 @@
 import unicodedata
 import uuid
 import xml.etree.cElementTree as etree
+from copy import copy
 from typing import Callable, Dict, Iterable, List, Optional, Tuple
 
 import bioc
+from unidecode import unidecode
+
+from .constants import GREEK_ALPHABET
 
 # XML elements to ignore the contents of
 IGNORE_LIST = [
@@ -19,7 +23,7 @@
     "tex-math",
     "mml:math",
     "object-id",
-    "ext-link",
+    "ext-link",  # TODO: should we keep URL content? some of these have text rather than the URL as inner content
 ]
 
 # XML elements to separate text between (into different passages)
@@ -34,15 +38,18 @@
     "label",
 ]
 
+
 TABLE_DELIMITER = '\t'
 TABLE_DELIMITED_TAGS = {'tr', 'th', 'td'}
+# Tags that should be pre-pended with a space on merge
+PSEUDO_SPACE_TAGS = {'sup', 'break'}
+ANNOTATION_MARKER_PATTERN = r'ANN_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'
 
 
 class TextChunk:
     text: str
     xml_node: str
     xml_path: str
-    non_separating: bool = False
     is_tail: bool = False
     is_annotation: bool = False
 
@@ -51,14 +58,12 @@ def __init__(
         text,
         xml_node,
         xml_path=None,
-        non_separating=False,
         is_tail=False,
         is_annotation=False,
     ):
         self.text = text
         self.xml_node = xml_node
         self.xml_path = xml_path
-        self.non_separating = non_separating or is_annotation
         self.is_tail = is_tail
         self.is_annotation = is_annotation
 
@@ -72,10 +77,10 @@ def __repr__(self):
         tag = self.tag
         if self.is_tail:
             tag = f'{tag}#'
-        ns = '-ns' if self.non_separating else ''
-        tag = f'{tag}{ns}'
         if self.text:
             tag = f'{tag}+text[{len(self.text)}]'
+        if self.is_annotation:
+            tag = f'{tag}@'
         return tag
 
     @property
@@ -92,9 +97,9 @@ def remove_brackets_without_words(text: str) -> str:
     changed = True
     previous_text = text
     while changed:
-        fixed = re.sub(r"\([^\w\t]*\)", "", previous_text)
-        fixed = re.sub(r"\[[^\w\t]*\]", "", fixed)
-        fixed = re.sub(r"\{[^\w\t]*\}", "", fixed)
+        fixed = re.sub(r"\([^\w\t-]*\)", "", previous_text)
+        fixed = re.sub(r"\[[^\w\t-]*\]", "", fixed)
+        fixed = re.sub(r"\{[^\w\t-]*\}", "", fixed)
         changed = bool(previous_text != fixed)
         previous_text = fixed
     return fixed
@@ -115,22 +120,30 @@ def cleanup_text(text: str) -> str:
     """
     # Remove some "control-like" characters (left/right separator)
     text = text.replace(u"\u2028", " ").replace(u"\u2029", " ")
+    text = text.replace('°', ' ° ')
     text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C" or ch == TABLE_DELIMITER)
     text = "".join(ch if unicodedata.category(ch)[0] != "Z" else " " for ch in text)
 
+    # replace greek letters with their long-form equivalent
+    for greek_letter, replacement in GREEK_ALPHABET.items():
+        text = text.replace(greek_letter, replacement)
+
+    text = unidecode(text, errors='preserve')
+
     # Remove repeated commands and commas next to periods
     text = re.sub(r",([^\S\t]*,)*", ",", text)
     text = re.sub(r"(,[^\S\t]*)*\.", ".", text)
     text = remove_brackets_without_words(text)
 
-    # remove extra spaces from in-text figute/table citations
+    # remove extra spaces from in-text figure/table citations
     text = re.sub(r'\([^\S\t]*([^)]*[^\s)])[^\S\t]*\)', r'(\1)', text)
 
     # remove trailing spaces before periods
     text = re.sub(r'[^\S\t]+\.(\s|$)', r'.\1', text)
 
     # remove extra spaces around commas/semi-colons
-    text = re.sub(r'[^\S\t]*([,;])[^\S\t]+', r'\1 ', text)
+    text = re.sub(r'[^\S\t]*([,;:])([^\S\t]+)', r'\1 ', text)
+    text = re.sub(r'[^\S\t]*([,;:])$', r'\1', text)
 
     # trim leading and trailing non tab whitespace
     text = re.sub(r'(^|\t)([^\S\t]+)', r'\1', text)
@@ -168,17 +181,14 @@ def merge_adjacent_xref_siblings(elem_list):
     If two XML elements in a list are adjacent and both xrefs separated only by punctuation, merge them
     """
     siblings = []
-
     for elem in elem_list:
         if siblings and elem.tag == 'xref' and siblings[-1].tag == 'xref':
             # merge these 2 if the tail of the first element is a punctuation mark
             prev_tail = (siblings[-1].tail or '').strip()
             if (
-                siblings[-1].tail
-                and len(prev_tail) == 1
-                and unicodedata.category(prev_tail)[0] == 'P'
-                and elem.attrib.get('ref-type') == siblings[-1].attrib.get('ref-type')
-            ):
+                not prev_tail
+                or (len(prev_tail) == 1 and unicodedata.category(prev_tail[0])[0] == 'P')
+            ) and elem.attrib.get('ref-type') == siblings[-1].attrib.get('ref-type'):
 
                 siblings[-1].text = (siblings[-1].text or '') + prev_tail + (elem.text or '')
                 siblings[-1].tail = elem.tail
@@ -189,7 +199,7 @@ def merge_adjacent_xref_siblings(elem_list):
 
 def get_tag_path(mapping: Dict[etree.Element, etree.Element], node: etree.Element) -> str:
     """
-    Get a string representing the path of the currentl XML node in the heirachry of the XML file
+    Get a string representing the path of the current XML node in the hierarchy of the XML file
     """
     path = []
     current_node = node
@@ -217,8 +227,8 @@ def tag_handler(
         except NotImplementedError:
             pass
     # Extract any raw text directly in XML element or just after
-    head = (elem.text or "").strip()
-    tail = (elem.tail or "").strip()
+    head = elem.text or ""
+    tail = elem.tail or ""
 
     # Then get the text from all child XML nodes recursively
     child_passages = []
@@ -246,16 +256,14 @@ def tag_handler(
         ):
             # Check if the tag should be ignored (so don't use main contents)
             return [
-                TextChunk(tail, elem, non_separating=True, is_tail=True),
+                TextChunk(tail, elem, is_tail=True),
             ]
 
     return [TextChunk(head, elem)] + child_passages + [TextChunk(tail, elem, is_tail=True)]
 
 
 def strip_annotation_markers(
-    text: str,
-    annotations_map: Dict[str, str],
-    marker_pattern=r'ANN_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',
+    text: str, annotations_map: Dict[str, str], marker_pattern=ANNOTATION_MARKER_PATTERN
 ) -> Tuple[str, List[bioc.BioCAnnotation]]:
     """
     Given a set of annotations, remove any which are found in the current text and return
@@ -271,7 +279,9 @@ def strip_annotation_markers(
     transformed_text = text
 
     pattern = (
-        r'([^\S\t]*)([\(\[\{][^\S\t]*)?(' + marker_pattern + r')([^\S\t]*[\)\]\}])?([^\S\t]*)(\.)?'
+        r'([^\S\t]*)([\(\[\{][^\S\t]*)?('
+        + marker_pattern
+        + r')([^\S\t]*[\)\]\}])?([^\S\t]*)([\.,;:])?'
     )
 
     matched_annotations: List[Tuple[int, int, str]] = []
@@ -286,13 +296,17 @@ def strip_annotation_markers(
         end_offset = 0
 
         matched_brackets = (
-            br_open and br_close and br_open.strip() + br_close.strip() in {'\{\}', '[]', '()'}
+            br_open and br_close and br_open.strip() + br_close.strip() in {r'{}', '[]', '()'}
         )
 
         if not matched_brackets and (br_open or br_close):
             # do not include in the sequence to be removed from the text
-            start_offset += len(ws_start or '') + len(br_open or '')
-            end_offset += len(period or '') + len(ws_end or '') + len(br_close or '')
+            if br_open:
+                start_offset += len(ws_start or '') + len(br_open or '')
+                # end_offset += len(period or '') + len(ws_end or '') + len(br_close or '')
+            else:
+                # start_offset += len(ws_start or '') + len(br_open or '')
+                end_offset += len(period or '') + len(ws_end or '') + len(br_close or '')
         elif not period:
             if ws_end:
                 end_offset += len(ws_end)
@@ -327,7 +341,41 @@ def strip_annotation_markers(
     return transformed_text, transformed_annotations
 
 
-def merge_text_chunks(chunk_list, annotations_map=None) -> TextChunk:
+def remove_style_tags(chunk_list_in: List[TextChunk], style_tags=['italic', 'bold', 'emph']):
+    """
+    Given some list of text chunks, simplify the list to remove consecutive style-only tags
+    """
+    if len(chunk_list_in) < 4:
+        return chunk_list_in
+
+    start_index = 1
+    chunk_list = chunk_list_in[:]
+
+    while start_index < len(chunk_list) - 2:
+        current_tag = chunk_list[start_index]
+
+        if current_tag.tag not in style_tags or current_tag.is_tail:
+            start_index += 1
+            continue
+
+        closing_tag = chunk_list[start_index + 1]
+
+        if closing_tag.tag != current_tag.tag or not closing_tag.is_tail:
+            start_index += 1
+            continue
+
+        chunk_list[start_index - 1] = copy(chunk_list[start_index - 1])
+        chunk_list[start_index - 1].text = (
+            chunk_list[start_index - 1].text + current_tag.text + closing_tag.text
+        )
+        chunk_list = chunk_list[:start_index] + chunk_list[start_index + 2 :]
+
+    length_diff = sum([len(c.text) for c in chunk_list_in]) - sum([len(c.text) for c in chunk_list])
+    assert length_diff == 0, f'characters changed {length_diff}'
+    return chunk_list
+
+
+def merge_text_chunks(chunk_list: List[TextChunk], annotations_map=None) -> TextChunk:
     """
     Merge some list of text chunks and pick the most top-level xml node associated with the list to be the new node for the chunk
 
@@ -336,40 +384,48 @@ def merge_text_chunks(chunk_list, annotations_map=None) -> TextChunk:
     if annotations_map is None:
         # if no mapping is expected, simply drop annotation chunks
         chunk_list = [c for c in chunk_list if not c.is_annotation]
-
+    chunk_list = remove_style_tags(chunk_list)
     merge = []
 
     for i, current_chunk in enumerate(chunk_list):
         if i > 0:
             previous_chunk = chunk_list[i - 1]
-            join_char = ' '
+            join_char = ''
             tags = {previous_chunk.tag, current_chunk.tag}
-            if any(
-                [
-                    previous_chunk.is_annotation,
-                    current_chunk.is_annotation,
-                    previous_chunk.non_separating,
-                    current_chunk.non_separating,
-                    current_chunk.is_tail and not (current_chunk.text or previous_chunk.text),
-                ]
+            if current_chunk.tag == 'sup':
+                if not current_chunk.is_tail:
+                    if re.match(r'^\s*(−|-)?\d+\s*$', current_chunk.text):
+                        if (
+                            previous_chunk.text
+                            and unicodedata.category(previous_chunk.text[-1])[0] != 'P'
+                        ):
+                            join_char = '^'
+                    elif (
+                        current_chunk.text
+                        and previous_chunk.text
+                        and unicodedata.category(current_chunk.text[0])[0]
+                        == unicodedata.category(previous_chunk.text[-1])[0]
+                    ):
+                        join_char = '-'
+            elif current_chunk.tag in PSEUDO_SPACE_TAGS or (
+                current_chunk.tag == 'xref' and not current_chunk.is_annotation
             ):
-                join_char = ''
+                join_char = ' '
             elif len(tags) == 1 and tags & TABLE_DELIMITED_TAGS and not current_chunk.is_tail:
                 join_char = TABLE_DELIMITER
 
             merge.append(join_char)
 
-        current_text = cleanup_text(current_chunk.text)
         if current_chunk.is_annotation:
             ann_id = f'ANN_{uuid.uuid4()}'
-            annotations_map[ann_id] = current_text
+            annotations_map[ann_id] = current_chunk.text
             merge.append(ann_id)
         else:
-            merge.append(current_text)
+            merge.append(current_chunk.text)
 
     text = ''.join(merge)
     # Remove any newlines (as they can be trusted to be syntactically important)
-    text = text.replace('\n', '')
+    text = text.replace('\n', ' ')
     text = cleanup_text(text)
 
     first_non_tail_node = chunk_list[0].xml_node
@@ -399,6 +455,7 @@ def extract_text_chunks(
     """
     if not isinstance(element_list, list):
         element_list = [element_list]
+
     raw_text_chunks = []
     for elem in element_list:
         raw_text_chunks.extend(tag_handler(elem, tag_handlers))