-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* WIP Parse Chinese numerals. * WIP complete number parsing. * Complete Chinese numerals: * Use standard table override instead of pre-config hooks. * Add few test strings. * Complete numerals: * Transliterate all numeric examples correctly * Modify hook return logic for consistency * WIP partial spacing fix. * Some cleanup; upgrade docker OS. * Add dependency for uwsgi.
- Loading branch information
Showing
10 changed files
with
46,047 additions
and
45,661 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
__doc__ = """Chinese hooks.""" | ||
|
||
|
||
from logging import getLogger | ||
from re import I, compile, search, sub | ||
|
||
from scriptshifter.hooks.general import normalize_spacing_post_assembly | ||
|
||
|
||
logger = getLogger(__name__) | ||
|
||
|
||
# Token emitted by the transliteration table for a numeral: the pinyin
# reading, a "#", the optional Arabic value, and optional trailing whitespace.
_NUM_TOKEN_RE = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
# A single digit followed by a power-of-ten-like number, e.g. "2 10".
_MULT_RE = compile(r"(\b[0-9]) ([1-9]0+)")
# A round number followed by a smaller addend, e.g. "20 7".
_SUM_RE = compile("([1-9]0+) ([0-9]+)")


def _combine_numerals(num_v, merge_digits):
    """
    Fold literal numeral sequences in ``num_v`` into Arabic numbers.

    For example ``"2 10 7"`` becomes ``"27"``: ``2 10`` is multiplied into
    ``20`` and ``20 7`` is then summed.

    :param num_v: the numeric rendering of a run of numeral tokens.
    :param merge_digits: when true, every single space found between two
        digits is removed after the arithmetic pass.
    :return: the folded string.
    """
    while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
        logger.debug(f"Match number combination: {_m}")
        if m := _MULT_RE.search(num_v):
            logger.debug(f"Multiply: {m[1]}, {m[2]}")
            parsed = int(m[1]) * int(m[2])
            num_v = _MULT_RE.sub(str(parsed), num_v, 1)
        elif m := _SUM_RE.search(num_v):
            logger.debug(f"Add: {m[1]}, {m[2]}")
            parsed = int(m[1]) + int(m[2])
            num_v = _SUM_RE.sub(str(parsed), num_v, 1)
        else:
            # The loop condition matched but neither rule applies; stop
            # rather than spin forever.
            break

    # Four separate digits in a row are joined into one number.
    num_v = sub("([0-9]) ([0-9]) ([0-9]) ([0-9])", r"\1\2\3\4", num_v)
    if merge_digits:
        # Drop every single space that sits between two digits. The
        # lookarounds do this in one pass, with the same result as merging
        # pairs repeatedly until none is left.
        num_v = sub("(?<=[0-9]) (?=[0-9])", "", num_v)

    return num_v


def parse_numerals_pre_assembly(ctx):
    """
    Parse Chinese numerals in the romanized token list.

    This is run at pre-assembly: it reads the tokens in ``ctx.dest_ls``,
    builds the output string itself, stores it in ``ctx.dest`` and returns
    the value of ``normalize_spacing_post_assembly(ctx)``, so that the main
    transliterate function skips its own joining.

    Each run of consecutive numeral tokens is kept in two versions: the text
    version (the pinyin without the ``#`` suffixes) and the numeric version
    (Arabic numerals wherever the token carries one). The numeric version is
    used when the ``marc_field`` option is ``245n`` or ``830n``, or once a run
    looks like an ordinal (``di N``), four separate digits, or a date
    (``N nian N yue``, ``N yue N ri``). Once selected, the numeric version
    stays selected for all following runs in the same call.

    :param ctx: transliteration context. Only ``ctx.options`` (queried with
        ``.get("marc_field")``), ``ctx.dest_ls`` (indexable sequence of string
        tokens) and ``ctx.dest`` (assigned) are used here.
    :return: whatever ``normalize_spacing_post_assembly`` returns for ``ctx``.
    """
    marc_field = ctx.options.get("marc_field")
    # Only force the numeric version in specific MARC subfields.
    num_subfield = marc_field in ("245n", "830n")
    use_num_v = num_subfield
    # Digits are merged for the numeric subfields, as in the original
    # TypeScript logic, which used one condition for both decisions. The bare
    # "245" / "830" values are still accepted so existing callers keep their
    # behavior.
    # NOTE(review): confirm which spelling of the option value is canonical.
    merge_digits = num_subfield or marc_field in ("245", "830")

    tokens = ctx.dest_ls
    tk_ct = len(tokens)
    parts = []

    # Use manual loop as i is manipulated inside it.
    i = 0
    while i < tk_ct:
        tk_i = tokens[i]
        if not _NUM_TOKEN_RE.search(tk_i):
            logger.debug(f"No numeric match: adding {tk_i}.")
            parts.append(tk_i)
            i += 1
            continue

        # A numeral token was reached: consume it and all consecutive
        # numeral tokens after it, then decide which version to emit.
        logger.debug(f"Match number: {tk_i}.")
        text_v = num_v = ""
        for j in range(i, tk_ct):
            tk_j = tokens[j]
            m = _NUM_TOKEN_RE.search(tk_j)
            is_last = j == tk_ct - 1

            if m and not is_last:
                # Still inside the run: whitespace captured after the token
                # is dropped and a single space is used instead.
                text_v += m[1] + " "
                num_v += (m[2] or m[1]) + " "
                continue

            # A token without # (or the end of the list) is reached.
            logger.debug(f"Next token is not numeric: {tk_j}")
            if m:
                # Last token of the list and it is a numeral: keep the
                # whitespace captured after it.
                text_v += m[1] + m[3]
                num_v += (m[2] or m[1]) + m[3] + " "
            elif is_last:
                # If the last token is non-numerical, just tack it on.
                logger.debug(f"Last token is non-numerical: {tk_j}")
                text_v += tk_j
                num_v += tk_j

            # Evaluate the run built so far. `use_num_v` is part of the
            # condition so that runs in forced-numeric fields, or after an
            # earlier date/ordinal, are folded too instead of being emitted
            # as raw sequences like "2 10 7".
            if (
                use_num_v or
                search("^di [0-9]", num_v, flags=I) or
                search("[0-9] [0-9] [0-9] [0-9]", num_v) or
                search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
                search("[0-9]+ yue [0-9]+ ri", num_v, flags=I)
            ):
                use_num_v = True
                num_v = _combine_numerals(num_v, merge_digits)

            parts.append(num_v if use_num_v else text_v)

            # If the end of the list is not reached, resume the outer loop at
            # the non-numeral token that ended the run; otherwise we are done.
            i = j + 1 if is_last else j
            break

    logger.debug(f"Use num version: {use_num_v}")
    ctx.dest = "".join(parts)

    # Skip main transliterate function joining.
    return normalize_spacing_post_assembly(ctx)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
/**
 * Replace pinyin readings of Chinese numerals with Arabic numerals where
 * the context calls for it.
 *
 * Numeral tokens in the input have the form `reading#value` (e.g. `er#2`);
 * the value part may be empty. For each run of consecutive numeral tokens
 * two versions are built: the text version (readings only) and the numeric
 * version (values where present). The numeric version is used when the
 * field is 245$n or 830$n, or once a run looks like an ordinal (`di N`),
 * four separate digits, or a date (`N nian N yue`, `N yue N ri`). Once
 * selected, the numeric version stays selected for the rest of the string.
 *
 * @param pinyinString romanized string containing `#`-marked numeral tokens
 * @param tag MARC tag of the field being processed
 * @param code MARC subfield code being processed
 * @returns the string with the `#` markers resolved
 */
private processNumbers(pinyinString: string, tag: string, code: string): string {
    let outputString = "";
    //useNumVersion is set in specific subfields where we definitely want to treat numbers as numbers
    const isNumberSubfield = (tag == "245" || tag == "830") && code == "n";
    let useNumVersion = isNumberSubfield;

    /*
     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
     * The delimiters will be captured and included in the string of tokens. Only the even-numbered
     * array elements are the true 'tokens', so the code for processing tokens is run only for even
     * values of j.
     */
    const tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)", "u"));
    const numToken_re = new RegExp("^([A-Za-z]+)#([0-9]*)$");
    const n = tokens.length;

    for (let i = 0; i < n; i++) {
        const toki = tokens[i];
        if (!toki.match(numToken_re)) {
            // the outer loop has encountered a non-numeric token or delimiter, just tack it on.
            outputString += toki;
            continue;
        }

        /*
         * When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive
         * numerical tokens found after it. When a non-numerical token (or end of string) is encountered, the
         * string of numerical tokens is evaluated to determine which version should be used in the output
         * string. The outer loop then continues where the inner loop left off.
         */
        let textVersion = "";
        let numVersion = "";
        for (let j = i; j < n; j++) {
            const tokj = tokens[j];
            /* a token without # (or the end of string) is reached */
            if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) {
                //If m is set, then we are on the last token and it is numeric.
                //Add text after # (if present) to numerical version
                let m = tokj.match(numToken_re);
                if (m) {
                    textVersion += m[1];
                    numVersion += m[2] == "" ? m[1] : m[2];
                } else if (j == n - 1) {
                    //if last token is non-numerical, just tack it on.
                    textVersion += tokj;
                    numVersion += tokj;
                } else if (textVersion.length > 0 && numVersion.length > 0) {
                    //if not at end of string yet and token is non-numerical, remove the last delimiter
                    //that was appended (outer loop will pick up at this point)
                    textVersion = textVersion.substring(0, textVersion.length - 1);
                    numVersion = numVersion.substring(0, numVersion.length - 1);
                }

                //evaluate numerical string that has been constructed so far
                //use num version for ordinals and date strings
                if (numVersion.match(/^di [0-9]/i) ||
                    numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
                    numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
                    // No leading quote here: a stray `"` in this pattern kept
                    // month/day strings from ever matching.
                    numVersion.match(/[0-9]+ yue [0-9]+ ri/i) ||
                    useNumVersion
                ) {
                    useNumVersion = true;
                    /*
                     * At this point, string may contain literal translations of Chinese numerals
                     * Convert these to Arabic numerals (for example "2 10 7" = "27").
                     */
                    while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
                        m = numVersion.match(/([0-9]+) ([1-9]0+)/);
                        if (m) {
                            const product = Number(m[1]) * Number(m[2]);
                            numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(product));
                        } else {
                            const mb = numVersion.match(/([1-9]0+) ([0-9]+)/);
                            if (mb) {
                                const sum = Number(mb[1]) + Number(mb[2]);
                                numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sum));
                            } else {
                                //loop condition matched but neither rule applies; stop
                                break;
                            }
                        }
                    }

                    //A few other tweaks
                    numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
                    if (isNumberSubfield) {
                        while (numVersion.match(/[0-9] [0-9]/)) {
                            numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
                        }
                    }
                }

                outputString += useNumVersion ? numVersion : textVersion;

                //if the end of the string is not reached, backtrack to the delimiter after the last
                //numerical token (i.e. two tokens ago); otherwise we are at the end of the string,
                //so we are done!
                i = j < n - 1 ? j - 2 : j;
                break;
            }

            //this is run when we are not yet at the end of the string and have not yet reached a
            //non-numerical token.
            if (j % 2 == 0) {
                const m = tokj.match(numToken_re);
                textVersion += m[1];
                numVersion += m[2] == "" ? m[1] : m[2];
            } else {
                //a delimiter, just tack it on.
                textVersion += tokj;
                numVersion += tokj;
            }
        }
    }
    return outputString;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
__doc__ = """ | ||
General-purpose hooks. | ||
""" | ||
|
||
from logging import getLogger | ||
from re import compile | ||
|
||
from scriptshifter.trans import MULTI_WS_RE | ||
|
||
|
||
NORM1_RE = compile(r"([.,;:\)\]}])\s") | ||
NORM2_RE = compile(r"(\S)([.,;:\)\]}])") | ||
NORM3_RE = compile(r"\s([\)\]\}])") | ||
NORM4_RE = compile(r"([\)\]\}])(\S)") | ||
|
||
logger = getLogger(__name__) | ||
|
||
|
||
def normalize_spacing_post_assembly(ctx):
    """
    Return ``ctx.dest`` with whitespace and punctuation spacing rewritten.

    The stripped string is passed through a fixed sequence of regular
    expression substitutions. ``ctx.dest`` itself is not modified; the
    rewritten string is returned.

    :param ctx: object exposing the assembled string as ``ctx.dest``.
    :return: the rewritten string.
    """
    logger.debug(f"Dest pre manipulation: {ctx.dest}")

    # (pattern, replacement) pairs, applied strictly in this order.
    rewrite_steps = (
        # De-duplicate whitespace (pattern is defined in scriptshifter.trans).
        (MULTI_WS_RE, r"\1"),
        # Drop the whitespace character following one of . , ; : ) ] }
        (NORM1_RE, r"\1"),
        # Put a space between a non-space character and a following
        # . , ; : ) ] }
        (NORM2_RE, r"\1 \2"),
        # Drop the whitespace character preceding a closing ) ] }
        (NORM3_RE, r"\1"),
        # Put a space between a closing ) ] } and a following non-space
        # character.
        (NORM4_RE, r"\1 \2"),
    )

    text = ctx.dest.strip()
    for pattern, replacement in rewrite_steps:
        text = pattern.sub(replacement, text)

    return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.