Chinese numerals (#97)

* WIP Parse Chinese numerals. * WIP complete number parsing. * Complete Chinese numerals: * Use standard table override instead of pre-config hooks. * Add few test strings. * Complete numerals: * Transliterate all numeric examples correctly * Modify hook return logic for consistency * WIP partial spacing fix. * Some cleanup; upgrade docker OS. * Add dependency for uwsgi.
lcnetdev · Apr 19, 2024 · 8c7ce0e · 8c7ce0e
1 parent 30859a5
commit 8c7ce0e
Show file tree

Hide file tree

Showing 10 changed files with 46,047 additions and 45,661 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,7 @@
-FROM python:3.10-slim-bullseye
+FROM python:3.10-slim-bookworm
 
 RUN apt update
-RUN apt install -y build-essential tzdata gfortran libopenblas-dev libboost-all-dev
+RUN apt install -y build-essential tzdata gfortran libopenblas-dev libboost-all-dev libpcre2-dev
 
 ENV TZ=America/New_York
 ENV _workroot "/usr/local/scriptshifter/src"

diff --git a/doc/hooks.md b/doc/hooks.md
@@ -333,7 +333,7 @@ and return it before any further default processing is done.
 
 #### Output
 
-`"ret"` or `None`. If `"ret"`, the transliteration function returns `ctx.dest`
+String or `None`. If a string, the transliteration function returns that
 immediately; otherwise it proceeds with standard adjustments of the output
 string before returning.
 

diff --git a/scriptshifter/hooks/chinese/__init__.py b/scriptshifter/hooks/chinese/__init__.py
@@ -0,0 +1,131 @@
+__doc__ = """Chinese hooks."""
+
+
+from logging import getLogger
+from re import I, compile, search, sub
+
+from scriptshifter.hooks.general import normalize_spacing_post_assembly
+
+
+logger = getLogger(__name__)
+
+
+def parse_numerals_pre_assembly(ctx):
+    """
+    Parse Chinese numerals in the already romanized result.
+
+    Walk the token list in ``ctx.dest_ls``. Tokens shaped like
+    ``<letters>#<digits>`` (the digits may be empty, trailing whitespace is
+    allowed) are treated as numeric. Each run of consecutive numeric tokens
+    is built in two flavors, a textual one and a numeric one, and one of the
+    two is appended to the output.
+
+    The assembled string is stored in ``ctx.dest``; the return value is
+    whatever ``normalize_spacing_post_assembly(ctx)`` returns.
+
+    NOTE(review): the function name says "pre_assembly" while this docstring
+    originally stated "This is run at post-assembly" -- confirm which hook
+    stage this function is registered for.
+    """
+    # Only apply to specific MARC fields.
+    # NOTE: this flag is never reset to False below; once a run of numeric
+    # tokens switches it on, every following run also uses the numeric
+    # version.
+    use_num_v = ctx.options.get("marc_field") in ("245n", "830n")
+
+    # tokens = split(r"[\W^#]", ctx.dest)  # Original logic.
+    tk_ct = len(ctx.dest_ls)
+    # Groups: 1 = romanized text, 2 = digits after # (may be empty),
+    # 3 = trailing whitespace.
+    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
+
+    output = ""
+
+    # Use manual loop as i is manipulated inside it.
+    i = 0
+
+    while i < tk_ct:
+        tk_i = ctx.dest_ls[i]
+        if search(token_ptn, tk_i):
+            # When a numerical token (containing #) is reached, the inner loop
+            # consumes it and all consecutive numerical tokens found after it.
+            # Two versions of the string are maintained. The textVersion is
+            # the original pinyin (minus the # suffixes). In the numVersion,
+            # characters representing numbers are converted to Arabic
+            # numerals. When a non-numerical token (or end of string) is
+            # encountered, the string of numerical tokens is evaluated to
+            # determine which version should be used in the output string.
+            # The outer loop then continues where the inner loop left off.
+            logger.debug(f"Match number: {tk_i}.")
+            text_v = num_v = ""
+            for j in range(i, tk_ct):
+                tk_j = ctx.dest_ls[j]
+                m = search(token_ptn, tk_j)
+                # if m:
+                #     logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
+                # a token without # (or the end of string) is reached
+                if not m or j == tk_ct - 1:
+                    logger.debug(f"Next token is not numeric: {tk_j}")
+                    # If m is set here, then we are on the last token and it
+                    # is numeric. Add the digits after # (or the text itself
+                    # when there are no digits) to the numerical version,
+                    # followed by the captured whitespace after the number.
+                    if m:
+                        text_v += m[1] + m[3]
+                        num_v += m[2] + m[3] if len(m[2]) else m[1] + m[3]
+                        # Append white space.
+                        num_v += " "
+                    elif j == tk_ct - 1:
+                        # if last token is non-numerical, just tack it on.
+                        logger.debug(f"Last token is non-numerical: {tk_j}")
+                        text_v += tk_j
+                        num_v += tk_j
+                    # evaluate numerical string that has been constructed so
+                    # far. Use num version for ordinals and date strings
+                    if (
+                        search("^di [0-9]", num_v, flags=I) or
+                        search("[0-9] [0-9] [0-9] [0-9]", num_v) or
+                        search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
+                        search("[0-9]+ yue [0-9]+ ri", num_v, flags=I)
+                    ):
+                        use_num_v = True
+                        # At this point, string may contain literal
+                        # translations of Chinese numerals. Convert these to
+                        # Arabic numerals (for example "2 10 7" = "27").
+                        mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
+                        sum_ptn = compile("([1-9]0+) ([0-9]+)")
+                        # Each pass replaces one multiplication, or failing
+                        # that one addition; stop when neither applies.
+                        while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
+                            logger.debug(f"Match number combination: {_m}")
+                            if m := mult_ptn.search(num_v):
+                                logger.debug(f"Multiply: {m[1]}, {m[2]}")
+                                parsed = int(m[1]) * int(m[2])
+                                num_v = mult_ptn.sub(str(parsed), num_v, 1)
+                            elif m := sum_ptn.search(num_v):
+                                logger.debug(f"Add: {m[1]}, {m[2]}")
+                                parsed = int(m[1]) + int(m[2])
+                                num_v = sum_ptn.sub(str(parsed), num_v, 1)
+                            else:
+                                break
+                        # A few other tweaks
+                        # Join four space-separated single digits together.
+                        num_v = sub(
+                                "([0-9]) ([0-9]) ([0-9]) ([0-9])",
+                                r"\1\2\3\4", num_v)
+                        # NOTE(review): the values tested here ("245", "830")
+                        # differ from those tested at the top of the function
+                        # ("245n", "830n") -- confirm this is intended.
+                        if ctx.options.get("marc_field") in ("245", "830"):
+                            # TODO optimize without loop.
+                            while search("[0-9] [0-9]", num_v):
+                                num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
+
+                    output += num_v if use_num_v else text_v
+
+                    # if the end of the string is not reached, set i so that,
+                    # after the increment at the bottom of the outer loop,
+                    # processing resumes at the non-numerical token j.
+                    #
+                    # Else, we are at the end of the string, so we are done!
+                    i = j - 1 if j < tk_ct - 1 else j
+                    break
+
+                # this is run when we are not yet at the end of the string and
+                # have not yet reached a non-numerical token. This is similar
+                # to the code that is run above when the last token is numeric,
+                # except that the captured whitespace after the token is
+                # dropped and a single space is appended instead.
+                # NOTE: this repeats the search already done at the top of
+                # this iteration.
+                m = search(token_ptn, tk_j)
+                text_v += m[1] + " "
+                num_v += m[2] if len(m[2]) else m[1]
+                num_v += " "
+
+        else:
+            logger.debug(f"No numeric match: adding {tk_i}.")
+            output += tk_i
+
+        i += 1
+
+    logger.debug(f"Use num version: {use_num_v}")
+    ctx.dest = output
+
+    # Skip main transliterate function joining.
+
+    return normalize_spacing_post_assembly(ctx)
diff --git a/scriptshifter/hooks/chinese/processNumbers.ts b/scriptshifter/hooks/chinese/processNumbers.ts
@@ -0,0 +1,144 @@
+/**
+ * Decide, for each run of numeric tokens in a romanized string, whether to
+ * output the pinyin text or Arabic numerals.
+ *
+ * Numeric tokens have the form `<letters>#<digits>` (the digits may be empty).
+ * The numeric version is used for ordinals ("di N"), four consecutive single
+ * digits, date strings ("N nian N yue", "N yue N ri"), and always when the
+ * tag is "245" or "830" and the code is "n". Once the numeric version has
+ * been selected for one run, it stays selected for the rest of the string.
+ *
+ * @param pinyinString romanized input whose numeric tokens carry a # marker.
+ * @param tag field tag; only "245" and "830" are treated specially.
+ * @param code subfield code; only "n" is treated specially.
+ * @returns the input with the # markers removed and numerals resolved.
+ */
+private processNumbers(pinyinString: string, tag: string, code: string): string {
+    let outputString = "";
+    let useNumVersion = false;
+    //useNumVersion is set in specific subfields where we definitely want to treat numbers as numbers
+    if ((tag == "245" || tag == "830") && code == "n") {
+       useNumVersion = true;
+    }
+
+    /*
+     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
+     * The delimiters will be captured and included in the string of tokens.  Only the even-numbered
+     * array elements are the true 'tokens', so the code for processing tokens is run only for even
+     * values of j.
+     */
+    let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u"));
+    let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$";
+    let numToken_re = new RegExp(numTokenPattern);
+    let n = tokens.length
+    //this.alert.info(tokens.join("|"),{autoClose: false})
+    for (let i = 0; i < n; i++) {
+        let toki = tokens[i];
+        if (toki.match(numToken_re)) {
+            /*
+             * When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive numerical tokens
+             * found after it.  Two versions of the string are maintained.  The textVersion is the original pinyin (minus the
+             * # suffixes).  In the numVersion, characters representing numbers are converted to Arabic numerals.  When a
+             * non-numerical token (or end of string) is encountered, the string of numerical tokens is evaluated to determine
+             * which version should be used in the output string.  The outer loop then continues where the inner loop left off.
+             */
+            let textVersion = "";
+            let numVersion = "";
+            for (let j = i; j < n; j++) {
+                let tokj = tokens[j];
+                /* a token without # (or the end of string) is reached */
+                if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) {
+                    //If m matches, then we are on the last token and it is numeric. Add text after # (if present) to numerical version
+                    let m = tokj.match(numToken_re);
+                    if (m) {
+                        textVersion += m[1]
+                        if (m[2] == "") {
+                            numVersion += m[1];
+                        } else {
+                            numVersion += m[2];
+                        }
+                    } else if (j == n - 1) {
+                    //if last token is non-numerical, just tack it on.
+                        textVersion += tokj;
+                        numVersion += tokj;
+                    } else if (textVersion.length > 0 && numVersion.length > 0) {
+                    //if not at end of string yet and token is non-numerical, remove the last delimiter that was appended
+                    //(outer loop will pick up at this point)
+                        textVersion = textVersion.substring(0, textVersion.length - 1);
+                        numVersion = numVersion.substring(0, numVersion.length - 1);
+                    }
+                    //evaluate numerical string that has been constructed so far
+                    //use num version for ordinals and date strings
+                    //(the "yue ... ri" pattern previously began with a stray double quote and could never match a date)
+                    if (numVersion.match(/^di [0-9]/i) ||
+                        numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
+                        numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
+                        numVersion.match(/[0-9]+ yue [0-9]+ ri/i) ||
+                        useNumVersion
+                       ) {
+                        useNumVersion = true;
+                        /*
+                         * At this point, string may contain literal translations of Chinese numerals
+                         * Convert these to Arabic numerals (for example "2 10 7" = "27").
+                         */
+
+                        while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
+                            m = numVersion.match(/([0-9]+) ([1-9]0+)/);
+                            if (m) {
+                                //despite its name, this holds the product of the two numbers
+                                let sum = Number(m[1]) * Number(m[2]);
+                                numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum));
+                            } else {
+                                let mb = numVersion.match(/([1-9]0+) ([0-9]+)/);
+                                if (mb)
+                                {
+                                    let sumb = Number(mb[1]) + Number(mb[2]);
+                                    numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb));
+                                }
+                                else
+                                {
+                                    break;
+                                }
+                            }
+                        }
+
+                        //A few other tweaks
+                        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
+                        if ((tag == "245" || tag == "830") && code == "n") {
+                            while (numVersion.match(/[0-9] [0-9]/)) {
+                                numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
+                            }
+                        }
+                    }
+                    if (useNumVersion)
+                    {
+                        outputString += numVersion;
+                    }
+                    else
+                    {
+                        outputString += textVersion;
+                    }
+                    //if the end of the string is not reached, backtrack to the delimiter after the last numerical token
+                    //(i.e. two tokens ago)
+                    if (j < n - 1)
+                    {
+                        i = j - 2;
+                    }
+                    else //we are at the end of the string, so we are done!
+                    {
+                        i = j;
+                    }
+                    break;
+                }
+                //this is run when we are not yet at the end of the string and have not yet reached a non-numerical token
+                //This is identical to the code that is run above when the last token is numeric.
+                if (j % 2 == 0)
+                {
+                    let m = tokj.match(numToken_re);
+                    textVersion += m[1];
+                    if (m[2]== "")
+                    {
+                        numVersion += m[1];
+                    }
+                    else
+                    {
+                        numVersion += m[2];
+                    }
+                }
+                else //a delimiter, just tack it on.
+                {
+                    textVersion += tokj;
+                    numVersion += tokj;
+                }
+            }
+        }
+        else // the outer loop has encountered a non-numeric token or delimiter, just tack it on.
+        {
+            outputString += toki;
+        }
+    }
+    return outputString;
+ }
diff --git a/scriptshifter/hooks/general/__init__.py b/scriptshifter/hooks/general/__init__.py
@@ -0,0 +1,31 @@
+__doc__ = """
+General-purpose hooks.
+"""
+
+from logging import getLogger
+from re import compile
+
+from scriptshifter.trans import MULTI_WS_RE
+
+
+# One of . , ; : ) ] } followed by a single whitespace character.
+NORM1_RE = compile(r"([.,;:\)\]}])\s")
+# A non-whitespace character immediately followed by one of . , ; : ) ] }
+NORM2_RE = compile(r"(\S)([.,;:\)\]}])")
+# A single whitespace character immediately before a closing ) ] or }
+NORM3_RE = compile(r"\s([\)\]\}])")
+# A closing ) ] or } immediately followed by a non-whitespace character.
+NORM4_RE = compile(r"([\)\]\}])(\S)")
+
+logger = getLogger(__name__)
+
+
+def normalize_spacing_post_assembly(ctx):
+    """
+    Remove duplicate and unwanted whitespace around punctuation.
+    """
+    # De-duplicate whitespace.
+    logger.debug(f"Dest pre manipulation: {ctx.dest}")
+    norm = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
+    norm = NORM1_RE.sub(r"\1", norm)
+    norm = NORM2_RE.sub(r"\1 \2", norm)
+    norm = NORM3_RE.sub(r"\1", norm)
+    norm = NORM4_RE.sub(r"\1 \2", norm)
+
+    return norm
diff --git a/scriptshifter/tables/__init__.py b/scriptshifter/tables/__init__.py
@@ -11,7 +11,7 @@
 except ImportError:
     from yaml import Loader
 
-from scriptshifter.exceptions import ConfigError
+from scriptshifter.exceptions import BREAK, ConfigError
 
 
 __doc__ = """
@@ -126,7 +126,10 @@ def __hash__(self):
 @cache
 def list_tables():
     """
-    List all the available tables.
+    List all the indexed tables.
+
+    Note that this may not correspond to all the table files in the data
+    folder, but only those exposed in the index.
     """
     with open(path.join(TABLE_DIR, "index.yml")) as fh:
         tdata = load(fh, Loader=Loader)
@@ -150,7 +153,18 @@ def load_table(tname):
     with open(fname) as fh:
         tdata = load(fh, Loader=Loader)
 
-    # NOTE Only one level of inheritance. No need for recursion for now.
+    # Pre-config hooks.
+    # If any of these hooks returns BREAK, interrupt the configuration
+    # parsing and return whatever is obtained so far.
+    if "hooks" in tdata:
+        tdata["hooks"] = load_hook_fn(tname, tdata)
+    pre_cfg_hooks = tdata.get("hooks", {}).get("pre_config", [])
+    for hook_def in pre_cfg_hooks:
+        kwargs = hook_def[1] if len(hook_def) > 1 else {}
+        ret = hook_def[0](tdata, **kwargs)
+        if ret == BREAK:
+            return tdata
+
     parents = tdata.get("general", {}).get("parents", [])
 
     if "script_to_roman" in tdata: