Skip to content

Commit

Permalink
Merge branch 're' into test
Browse files Browse the repository at this point in the history
  • Loading branch information
scossu committed Dec 3, 2024
2 parents fbfdec4 + 91f3ad9 commit 9b98c2e
Show file tree
Hide file tree
Showing 28 changed files with 445 additions and 329 deletions.
2 changes: 1 addition & 1 deletion doc/rest_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ MIME type: `application/json`

Content: JSON object with the following keys:

- `lang`: Language code as given by the `/languages` endpoint.
- `lang`: Language code as given by the `/languages` endpoint.
- `text`: Input text to be transliterated.
- `capitalize`: One of `first` (capitalize the first letter of the input),
`all` (capitalize all words separated by spaces), or null (default: apply no
Expand Down
2 changes: 2 additions & 0 deletions example.env
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ FLASK_DEBUG=true
TXL_DICTA_EP="changeme"
TXL_FLASK_SECRET="changeme"
TXL_LOGLEVEL="INFO"
TXL_EMAIL_FROM="[email protected]"
TXL_EMAIL_TO="[email protected]"
LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"
144 changes: 144 additions & 0 deletions legacy/processNumbers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/**
 * Chooses, for each run of consecutive number-bearing pinyin tokens, between
 * the spelled-out pinyin and its Arabic-numeral rendering.
 *
 * A number-bearing token has the shape `letters#digits` (e.g. `yi#1`); the
 * digits may be empty (e.g. `yue#`), in which case the letters are used in
 * both renderings. Everything else (plain tokens and delimiters) is copied to
 * the output unchanged.
 *
 * The numeral rendering is selected when the run looks like an ordinal
 * (`di 1`), a four-digit sequence (`1 9 8 4`), or a date (`5 nian 3 yue`,
 * `3 yue 5 ri`), or when the subfield is 245$n / 830$n. Once any run selects
 * the numeral rendering, every later run in the same string does too (the
 * flag is never reset).
 *
 * @param pinyinString - Pinyin text whose numeric syllables carry a `#` suffix.
 * @param tag - Field tag; "245" and "830" are treated specially together with `code`.
 * @param code - Subfield code; "n" (with the tags above) forces numerals.
 * @returns The input with every `#` suffix resolved to text or numerals.
 */
private processNumbers(pinyinString: string, tag: string, code: string): string {
  // In these subfields numbers are always rendered as numerals.
  const isNumberSubfield = (tag === "245" || tag === "830") && code === "n";
  // Sticky: starts true for number subfields and is switched on (never off)
  // by the first run that matches one of the patterns below.
  let useNumVersion = isNumberSubfield;

  // Patterns marking a run as an ordinal, a four-digit sequence, or a date.
  const numeralContextPatterns: RegExp[] = [
    /^di [0-9]/i,
    /[0-9] [0-9] [0-9] [0-9]/,
    /[0-9]+ nian [0-9]+ yue/i,
    /[0-9]+ yue [0-9]+ ri/i, // fixed: previously began with a stray '"' and so never matched
  ];
  const numTokenRe = /^([A-Za-z]+)#([0-9]*)$/;

  /*
   * Split on any whitespace or punctuation character except '#'. The capture
   * group keeps the delimiters in the result, so even indexes hold tokens and
   * odd indexes hold the single-character delimiters between them.
   */
  const tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)", "u"));
  const n = tokens.length;
  let outputString = "";

  for (let i = 0; i < n; i++) {
    const toki = tokens[i];
    if (!numTokenRe.test(toki)) {
      // Plain token or delimiter: copy through.
      outputString += toki;
      continue;
    }

    /*
     * Start of a numeric run. The inner loop consumes this token and all
     * consecutive number-bearing tokens (with the delimiters between them),
     * building both renderings, until a plain token or the end of input.
     */
    let textVersion = "";
    let numVersion = "";
    for (let j = i; j < n; j++) {
      const tokj = tokens[j];
      const match = tokj.match(numTokenRe);
      const atEnd = j === n - 1;
      const runEnds = atEnd || (j % 2 === 0 && match === null);

      if (match) {
        // Number-bearing token: letters go to the text rendering, digits
        // (or the letters, when there are no digits) to the numeral one.
        textVersion += match[1];
        numVersion += match[2] === "" ? match[1] : match[2];
      } else if (!runEnds || atEnd) {
        // A delimiter inside the run, or a plain final token: tack it on.
        textVersion += tokj;
        numVersion += tokj;
      } else if (textVersion.length > 0 && numVersion.length > 0) {
        // A plain token ended the run: drop the delimiter appended just
        // before it; the outer loop re-emits it.
        textVersion = textVersion.substring(0, textVersion.length - 1);
        numVersion = numVersion.substring(0, numVersion.length - 1);
      }
      if (!runEnds) {
        continue;
      }

      if (useNumVersion || numeralContextPatterns.some((re) => re.test(numVersion))) {
        useNumVersion = true;
        /*
         * The run may hold literal renderings of Chinese numerals; fold them
         * into Arabic numbers ("2 10 7" -> "20 7" -> "27"). Each pass removes
         * one space or breaks, so the loop terminates.
         * NOTE(review): two adjacent round numbers are multiplied, so
         * "100 20 3" becomes "2003"; kept as-is to preserve legacy output.
         */
        while (/[0-9] 10+/.test(numVersion) || /[1-9]0+ [1-9]/.test(numVersion)) {
          const product = numVersion.match(/([0-9]+) ([1-9]0+)/);
          if (product) {
            numVersion = numVersion.replace(
              /[0-9]+ [1-9]0+/,
              String(Number(product[1]) * Number(product[2])),
            );
            continue;
          }
          const sum = numVersion.match(/([1-9]0+) ([0-9]+)/);
          if (!sum) {
            break;
          }
          numVersion = numVersion.replace(
            /[1-9]0+ [0-9]+/,
            String(Number(sum[1]) + Number(sum[2])),
          );
        }

        // Close up four-digit sequences such as years ("1 9 8 4" -> "1984").
        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
        if (isNumberSubfield) {
          // In number subfields, close up every remaining digit pair.
          while (/[0-9] [0-9]/.test(numVersion)) {
            numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
          }
        }
      }

      outputString += useNumVersion ? numVersion : textVersion;

      // Resume the outer loop at the delimiter that follows the last numeric
      // token (j - 2, then i++ lands on j - 1), or finish if input is exhausted.
      i = atEnd ? j : j - 2;
      break;
    }
  }
  return outputString;
}
2 changes: 1 addition & 1 deletion scriptshifter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
This DB stores all the runtime transliteration data.
"""
DB_PATH = environ.get(
"DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
"TXL_DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))

"""
SMTP server for sending email. For a dummy server that just echoes the
Expand Down
104 changes: 52 additions & 52 deletions scriptshifter/tables/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import logging
import re
import sqlite3

from collections import defaultdict
from functools import cache
from importlib import import_module
from json import dumps as jdumps, loads as jloads
from os import R_OK, access, environ, makedirs, path, unlink
from re import compile
from shutil import move

from yaml import load
Expand All @@ -28,9 +28,6 @@
"""


TMP_DB_PATH = path.join(
path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))

DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
# Can be overridden for tests.
TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)
Expand Down Expand Up @@ -143,15 +140,17 @@ def init_db():
This operation removes any preexisting database.
All tables in the index file (`./data/index.yml`) will be parsed
All tables in the index file (`./index.yml`) will be parsed
(including inheritance rules) and loaded into the designated DB.
This must be done only once at bootstrap. To update individual tables,
see populate_table(), which this function calls iteratively.
"""
# Create parent directories if necessary.
# If the DB already exists, it will be overwritten ONLY on success at
# hhis point.
# this point.
TMP_DB_PATH = path.join(
path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))
if path.isfile(TMP_DB_PATH):
# Remove previous temp file (possibly from failed attempt)
unlink(TMP_DB_PATH)
Expand All @@ -166,25 +165,17 @@ def init_db():
conn.executescript(fh.read())

# Populate tables.
with open(path.join(TABLE_DIR, "index.yml")) as fh:
with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
tlist = load(fh, Loader=Loader)
try:
with conn:
for tname, tdata in tlist.items():
res = conn.execute(
"""INSERT INTO tbl_language (
name, label, marc_code, description
) VALUES (?, ?, ?, ?)""",
(
tname, tdata.get("name"), tdata.get("marc_code"),
tdata.get("description"),
)
)
populate_table(conn, res.lastrowid, tname)
populate_table(conn, tname, tdata)

# If the DB already exists, it will be overwritten ONLY on success at
# this point.
move(TMP_DB_PATH, DB_PATH)
logger.info(f"Database initialized at {DB_PATH}.")
finally:
conn.close()
if path.isfile(TMP_DB_PATH):
Expand All @@ -201,7 +192,27 @@ def get_connection():
return sqlite3.connect(DB_PATH)


def populate_table(conn, tid, tname):
def populate_table(conn, tname, tdata):
"""
Populate an individual table with data from a configuration.
@param conn: SQLite connection.
@param tname(str): Table name.
@param tdata(dict): Table data.
"""
res = conn.execute(
"""INSERT INTO tbl_language (
name, label, marc_code, description
) VALUES (?, ?, ?, ?)""",
(
tname, tdata.get("name"), tdata.get("marc_code"),
tdata.get("description"),
)
)
tid = res.lastrowid

data = load_table(tname)
flags = 0
if "script_to_roman" in data:
Expand Down Expand Up @@ -247,20 +258,19 @@ def populate_table(conn, tid, tname):
hook_data[1].__name__, jdumps(hook_data[2])))

# Ignore rules (R2S only).
for row in sec.get("ignore", []):
if isinstance(row, dict):
if "re" in row:
flags = FEAT_RE
rule = row["re"]
else:
flags = 0
rule = row
for rule in sec.get("ignore", []):
conn.execute(
"""INSERT INTO tbl_ignore (
lang_id, rule, features
) VALUES (?, ?, ?)""",
(tid, rule, 0))

for rule in sec.get("ignore_ptn", []):
conn.execute(
"""INSERT INTO tbl_ignore (
lang_id, rule, features
) VALUES (?, ?, ?)""",
(tid, rule, flags))
(tid, rule, FEAT_RE))

# Double caps (S2R only).
for rule in sec.get("double_cap", []):
Expand Down Expand Up @@ -417,33 +427,22 @@ def load_table(tname):

# Ignore regular expression patterns.
# Patterns are evaluated in the order they are listed in the config.
ignore_ptn = [
re.compile(ptn)
for ptn in tdata["roman_to_script"].get("ignore_ptn", [])]
ignore_ptn = tdata["roman_to_script"].get("ignore_ptn", [])
for parent in parents:
parent_tdata = load_table(parent)
# NOTE: duplicates are not removed.
ignore_ptn = [
re.compile(ptn)
for ptn in parent_tdata.get(
"roman_to_script", {}).get("ignore_ptn", [])
] + ignore_ptn
ignore_ptn = parent_tdata.get(
"roman_to_script", {}).get("ignore_ptn", []) + ignore_ptn
tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn

# Ignore plain strings.
ignore = {
Token(t)
for t in tdata["roman_to_script"].get("ignore", [])
}
ignore = set(tdata["roman_to_script"].get("ignore", []))
for parent in parents:
parent_tdata = load_table(parent)
# No overriding occurs with the ignore list, only de-duplication.
ignore |= {
Token(t) for t in parent_tdata.get(
"roman_to_script", {}).get("ignore", [])
}
tdata["roman_to_script"]["ignore"] = [
t.content for t in sorted(ignore)]
ignore |= set(parent_tdata.get(
"roman_to_script", {}).get("ignore", []))
tdata["roman_to_script"]["ignore"] = sorted(ignore)

# Hooks.
if "hooks" in tdata["roman_to_script"]:
Expand Down Expand Up @@ -521,6 +520,10 @@ def get_language(lang):
if len(s2r_hooks):
data["script_to_roman"]["hooks"] = s2r_hooks

double_cap = get_lang_dcap(conn, lang_id)
if len(double_cap):
data["script_to_roman"]["double_cap"] = double_cap

# Roman to script map, ignore list, and hooks.

if data["has_r2s"]:
Expand All @@ -542,10 +545,6 @@ def get_language(lang):
if len(opt_data):
data["options"] = opt_data

double_cap = get_lang_dcap(conn, lang_id)
if len(double_cap):
data["double_cap"] = double_cap

conn.close()

return data
Expand Down Expand Up @@ -591,8 +590,9 @@ def get_lang_ignore(conn, lang_id):
"""SELECT rule, features FROM tbl_ignore
WHERE lang_id = ?""",
(lang_id,))
# Features (regular expressions) not implemented yet.
return tuple(row[0] for row in qry)
return tuple(
compile(row[0]) if row[1] & FEAT_RE else row[0]
for row in qry)


@cache
Expand Down Expand Up @@ -652,7 +652,7 @@ def get_lang_hooks(conn, lang_id, t_dir):
}
)

return hooks
return dict(hooks)


def get_lang_dcap(conn, lang_id):
Expand Down
Loading

0 comments on commit 9b98c2e

Please sign in to comment.