Skip to content

Commit

Permalink
Chinese numerals (#97)
Browse files Browse the repository at this point in the history
* WIP Parse Chinese numerals.

* WIP complete number parsing.

* Complete Chinese numerals:

* Use standard table override instead of pre-config hooks.
* Add few test strings.

* Complete numerals:

* Transliterate all numeric examples correctly
* Modify hook return logic for consistency
* WIP partial spacing fix.

* Some cleanup; upgrade docker OS.

* Add dependency for uwsgi.
  • Loading branch information
scossu authored Apr 19, 2024
1 parent 30859a5 commit 8c7ce0e
Show file tree
Hide file tree
Showing 10 changed files with 46,047 additions and 45,661 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM python:3.10-slim-bullseye
FROM python:3.10-slim-bookworm

RUN apt update
RUN apt install -y build-essential tzdata gfortran libopenblas-dev libboost-all-dev
RUN apt install -y build-essential tzdata gfortran libopenblas-dev libboost-all-dev libpcre2-dev

ENV TZ=America/New_York
ENV _workroot "/usr/local/scriptshifter/src"
Expand Down
2 changes: 1 addition & 1 deletion doc/hooks.md
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ and return it before any further default processing is done.

#### Output

`"ret"` or `None`. If `"ret"`, the transliteration function returns `ctx.dest`
String or `None`. If a string, the transliteration function returns that
immediately; otherwise it proceeds with standard adjustments of the output
string before returning.

Expand Down
131 changes: 131 additions & 0 deletions scriptshifter/hooks/chinese/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
__doc__ = """Chinese hooks."""


from logging import getLogger
from re import I, compile, search, sub

from scriptshifter.hooks.general import normalize_spacing_post_assembly


logger = getLogger(__name__)


def parse_numerals_pre_assembly(ctx):
    """
    Parse Chinese numerals in the romanized token list.

    The function works on the un-joined token list ``ctx.dest_ls`` (hence
    "pre-assembly" in its name), builds the output string itself, stores it
    in ``ctx.dest`` and returns the spacing-normalized result.

    Numeric tokens have the form ``<pinyin>#<digits>`` optionally followed by
    whitespace; ``<digits>`` may be empty. Each run of consecutive numeric
    tokens is kept in two renderings: a text version (pinyin without the
    ``#`` suffixes) and a numeric version (Arabic digits where available).
    The numeric version is used when the ``marc_field`` option is ``245n``
    or ``830n``, or when the run looks like an ordinal ("di N"), four
    space-separated digits, or a date ("N nian N yue", "N yue N ri"). Once
    the numeric version has been selected it stays selected for the rest of
    the token list.

    :param ctx: transliteration context. ``ctx.dest_ls`` and ``ctx.options``
        are read; ``ctx.dest`` is overwritten.
    :return: the value returned by ``normalize_spacing_post_assembly(ctx)``.
    """
    # Only apply unconditionally to specific MARC subfields.
    is_num_field = ctx.options.get("marc_field") in ("245n", "830n")
    use_num_v = is_num_field

    tokens = ctx.dest_ls
    tk_ct = len(tokens)

    # Patterns are loop invariants: compile them once up front.
    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
    combo_ptn = compile("[0-9] 10+|[1-9]0+ [1-9]")
    mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
    sum_ptn = compile("([1-9]0+) ([0-9]+)")

    output = ""

    # Use manual loop as i is manipulated inside it.
    i = 0

    while i < tk_ct:
        tk_i = tokens[i]
        if token_ptn.search(tk_i):
            # When a numerical token (containing #) is reached, the inner
            # loop consumes it and all consecutive numerical tokens found
            # after it. When a non-numerical token (or the end of the list)
            # is encountered, the run is evaluated to decide which rendering
            # goes into the output. The outer loop then continues where the
            # inner loop left off.
            logger.debug(f"Match number: {tk_i}.")
            text_v = num_v = ""
            for j in range(i, tk_ct):
                tk_j = tokens[j]
                m = token_ptn.search(tk_j)
                # A token without # (or the end of the list) is reached.
                if not m or j == tk_ct - 1:
                    logger.debug(f"Next token is not numeric: {tk_j}")
                    if m:
                        # Last token and it is numeric: keep the whitespace
                        # captured after the number in both renderings.
                        text_v += m[1] + m[3]
                        num_v += (m[2] or m[1]) + m[3]
                        # Append white space.
                        num_v += " "
                    elif j == tk_ct - 1:
                        # Last token is non-numerical: just tack it on.
                        logger.debug(f"Last token is non-numerical: {tk_j}")
                        text_v += tk_j
                        num_v += tk_j

                    # Evaluate the run built so far. The numeric version is
                    # used when already selected (MARC subfield or an earlier
                    # run), and for ordinals and date strings. Including
                    # use_num_v here mirrors the reference implementation:
                    # whatever is emitted as numbers must also be combined.
                    if (
                        use_num_v
                        or search("^di [0-9]", num_v, flags=I)
                        or search("[0-9] [0-9] [0-9] [0-9]", num_v)
                        or search("[0-9]+ nian [0-9]+ yue", num_v, flags=I)
                        or search("[0-9]+ yue [0-9]+ ri", num_v, flags=I)
                    ):
                        use_num_v = True
                        # At this point the string may contain literal
                        # translations of Chinese numerals. Convert these to
                        # Arabic numerals (for example "2 10 7" = "27").
                        while combo := combo_ptn.search(num_v):
                            logger.debug(f"Match number combination: {combo}")
                            if mult := mult_ptn.search(num_v):
                                logger.debug(
                                        f"Multiply: {mult[1]}, {mult[2]}")
                                parsed = int(mult[1]) * int(mult[2])
                                num_v = mult_ptn.sub(str(parsed), num_v, 1)
                            elif add := sum_ptn.search(num_v):
                                logger.debug(f"Add: {add[1]}, {add[2]}")
                                parsed = int(add[1]) + int(add[2])
                                num_v = sum_ptn.sub(str(parsed), num_v, 1)
                            else:
                                # Nothing left that can be combined.
                                break

                        # Four separate digits are joined (e.g. a year).
                        num_v = sub(
                            "([0-9]) ([0-9]) ([0-9]) ([0-9])",
                            r"\1\2\3\4", num_v)
                        # In the numeric subfields, join every remaining
                        # pair of adjacent digits. Uses the same subfield
                        # test as the top of the function.
                        if is_num_field:
                            # TODO optimize without loop.
                            while search("[0-9] [0-9]", num_v):
                                num_v = sub(
                                    "([0-9]) ([0-9])", r"\1\2", num_v)

                    output += num_v if use_num_v else text_v

                    # If the end of the list is not reached, step back one
                    # token so that the outer loop's increment lands on the
                    # non-numerical token j. Otherwise we are done.
                    i = j - 1 if j < tk_ct - 1 else j
                    break

                # Not at the end of the list and the token is numeric: add
                # it to both renderings, replacing any whitespace captured
                # after the token with a single space.
                text_v += m[1] + " "
                num_v += (m[2] or m[1]) + " "

        else:
            logger.debug(f"No numeric match: adding {tk_i}.")
            output += tk_i

        i += 1

    logger.debug(f"Use num version: {use_num_v}")
    ctx.dest = output

    # Skip main transliterate function joining.

    return normalize_spacing_post_assembly(ctx)
144 changes: 144 additions & 0 deletions scriptshifter/hooks/chinese/processNumbers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/**
 * Resolve the `#`-marked numeric tokens of a pinyin string.
 *
 * Numeric tokens have the form `<pinyin>#<digits>` (the digits may be empty).
 * Each run of consecutive numeric tokens is rendered either as plain pinyin
 * or with Arabic numerals. Numerals are used for 245$n / 830$n, for ordinals
 * ("di N"), for four space-separated digits, and for dates
 * ("N nian N yue", "N yue N ri"). Once numerals have been selected they stay
 * selected for the rest of the string.
 *
 * @param pinyinString romanized text whose numeric syllables carry a `#` suffix.
 * @param tag field tag; compared against "245" and "830".
 * @param code subfield code; compared against "n".
 * @returns the input with every numeric token replaced by its chosen rendering.
 */
private processNumbers(pinyinString: string, tag: string, code: string): string {
    let outputString = "";
    // useNumVersion is set in specific subfields where we definitely want to
    // treat numbers as numbers.
    const isNumberSubfield = (tag == "245" || tag == "830") && code == "n";
    let useNumVersion = isNumberSubfield;

    /*
     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
     * The delimiters are captured and included in the array of tokens. Only the even-numbered
     * array elements are the true 'tokens', so the code for processing tokens is run only for even
     * values of j.
     */
    let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)", "u"));
    let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$";
    let numToken_re = new RegExp(numTokenPattern);
    let n = tokens.length;
    for (let i = 0; i < n; i++) {
        let toki = tokens[i];
        if (toki.match(numToken_re)) {
            /*
             * When a numerical token (containing #) is reached, the inner loop consumes it and all
             * consecutive numerical tokens found after it. Two versions of the string are maintained.
             * The textVersion is the original pinyin (minus the # suffixes). In the numVersion,
             * characters representing numbers are converted to Arabic numerals. When a non-numerical
             * token (or end of string) is encountered, the string of numerical tokens is evaluated to
             * determine which version should be used in the output string. The outer loop then
             * continues where the inner loop left off.
             */
            let textVersion = "";
            let numVersion = "";
            for (let j = i; j < n; j++) {
                let tokj = tokens[j];
                /* a token without # (or the end of string) is reached */
                if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) {
                    // If m matches, we are on the last token and it is numeric.
                    // Add the digits after # (if present) to the numerical version.
                    let m = tokj.match(numToken_re);
                    if (m) {
                        textVersion += m[1];
                        if (m[2] == "") {
                            numVersion += m[1];
                        } else {
                            numVersion += m[2];
                        }
                    } else if (j == n - 1) {
                        // If the last token is non-numerical, just tack it on.
                        textVersion += tokj;
                        numVersion += tokj;
                    } else if (textVersion.length > 0 && numVersion.length > 0) {
                        // Not at end of string yet and token is non-numerical: remove the last
                        // delimiter that was appended (outer loop will pick up at this point).
                        textVersion = textVersion.substring(0, textVersion.length - 1);
                        numVersion = numVersion.substring(0, numVersion.length - 1);
                    }
                    // Evaluate the numerical string that has been constructed so far.
                    // Use the num version for ordinals and date strings.
                    // The "yue ... ri" pattern previously contained a stray double quote,
                    // which prevented it from ever matching a month/day date.
                    if (numVersion.match(/^di [0-9]/i) ||
                        numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
                        numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
                        numVersion.match(/[0-9]+ yue [0-9]+ ri/i) ||
                        useNumVersion
                    ) {
                        useNumVersion = true;
                        /*
                         * At this point, string may contain literal translations of Chinese numerals.
                         * Convert these to Arabic numerals (for example "2 10 7" = "27").
                         */
                        while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
                            m = numVersion.match(/([0-9]+) ([1-9]0+)/);
                            if (m) {
                                // Multiplier followed by a power of ten, e.g. "2 10" -> "20".
                                let sum = Number(m[1]) * Number(m[2]);
                                numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum));
                            } else {
                                let mb = numVersion.match(/([1-9]0+) ([0-9]+)/);
                                if (mb) {
                                    // Round number followed by a remainder, e.g. "20 7" -> "27".
                                    let sumb = Number(mb[1]) + Number(mb[2]);
                                    numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb));
                                } else {
                                    break;
                                }
                            }
                        }

                        // A few other tweaks: join four separate digits...
                        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
                        // ...and, in the numeric subfields, every remaining pair of adjacent digits.
                        if (isNumberSubfield) {
                            while (numVersion.match(/[0-9] [0-9]/)) {
                                numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
                            }
                        }
                    }
                    if (useNumVersion) {
                        outputString += numVersion;
                    } else {
                        outputString += textVersion;
                    }
                    // If the end of the string is not reached, backtrack to the delimiter after
                    // the last numerical token (i.e. two tokens ago).
                    if (j < n - 1) {
                        i = j - 2;
                    } else {
                        // We are at the end of the string, so we are done!
                        i = j;
                    }
                    break;
                }
                // This is run when we are not yet at the end of the string and have not yet
                // reached a non-numerical token. It mirrors the code run above when the last
                // token is numeric.
                if (j % 2 == 0) {
                    let m = tokj.match(numToken_re);
                    textVersion += m[1];
                    if (m[2] == "") {
                        numVersion += m[1];
                    } else {
                        numVersion += m[2];
                    }
                } else {
                    // A delimiter, just tack it on.
                    textVersion += tokj;
                    numVersion += tokj;
                }
            }
        } else {
            // The outer loop has encountered a non-numeric token or delimiter, just tack it on.
            outputString += toki;
        }
    }
    return outputString;
}
31 changes: 31 additions & 0 deletions scriptshifter/hooks/general/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
__doc__ = """
General-purpose hooks.
"""

from logging import getLogger
from re import compile

from scriptshifter.trans import MULTI_WS_RE


# Spacing rules applied, in this order, by normalize_spacing_post_assembly.
# The first pattern of each pair is replaced with r"\1", the second with
# r"\1 \2".
#
# NORM1: whitespace *before* trailing punctuation or a closing bracket is
# dropped ("a , b" -> "a, b").
NORM1_RE = compile(r"\s([.,;:\)\]}])")
# NORM2: a space is inserted *after* such a character when it is directly
# followed by a character that is neither whitespace nor another trailing
# mark ("a,b" -> "a, b"; "(a)," and "..." are left alone).
NORM2_RE = compile(r"([.,;:\)\]}])([^\s.,;:\)\]}])")
# NORM3 / NORM4: the same two rules restricted to closing brackets.
NORM3_RE = compile(r"\s([\)\]\}])")
NORM4_RE = compile(r"([\)\]\}])([^\s.,;:\)\]}])")

logger = getLogger(__name__)


def normalize_spacing_post_assembly(ctx):
    """
    Collapse repeated whitespace and tidy spacing next to punctuation.

    Only ``ctx.dest`` is read; the cleaned-up string is returned and ``ctx``
    is left untouched by this function.
    """
    logger.debug(f"Dest pre manipulation: {ctx.dest}")

    # Punctuation rules as (pattern, replacement) pairs, in application
    # order.
    punct_rules = (
        (NORM1_RE, r"\1"),
        (NORM2_RE, r"\1 \2"),
        (NORM3_RE, r"\1"),
        (NORM4_RE, r"\1 \2"),
    )

    # Trim both ends and de-duplicate whitespace before the punctuation
    # rules run.
    cleaned = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
    for pattern, replacement in punct_rules:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned
20 changes: 17 additions & 3 deletions scriptshifter/tables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
except ImportError:
from yaml import Loader

from scriptshifter.exceptions import ConfigError
from scriptshifter.exceptions import BREAK, ConfigError


__doc__ = """
Expand Down Expand Up @@ -126,7 +126,10 @@ def __hash__(self):
@cache
def list_tables():
"""
List all the available tables.
List all the indexed tables.
Note that this may not correspond to all the table files in the data
folder, but only those exposed in the index.
"""
with open(path.join(TABLE_DIR, "index.yml")) as fh:
tdata = load(fh, Loader=Loader)
Expand All @@ -150,7 +153,18 @@ def load_table(tname):
with open(fname) as fh:
tdata = load(fh, Loader=Loader)

# NOTE Only one level of inheritance. No need for recursion for now.
# Pre-config hooks.
# If any of these hooks returns BREAK, interrupt the configuration
# parsing and return whatever is obtained so far.
if "hooks" in tdata:
tdata["hooks"] = load_hook_fn(tname, tdata)
pre_cfg_hooks = tdata.get("hooks", {}).get("pre_config", [])
for hook_def in pre_cfg_hooks:
kwargs = hook_def[1] if len(hook_def) > 1 else {}
ret = hook_def[0](tdata, **kwargs)
if ret == BREAK:
return tdata

parents = tdata.get("general", {}).get("parents", [])

if "script_to_roman" in tdata:
Expand Down
Loading

0 comments on commit 8c7ce0e

Please sign in to comment.