From 408e57a42f873aaaba27da8457a32f2b7f5cf58f Mon Sep 17 00:00:00 2001 From: scossu Date: Mon, 21 Oct 2024 21:04:07 -0400 Subject: [PATCH 1/6] WIP ignore by regular expression. --- scriptshifter/tables/__init__.py | 48 ++++----- scriptshifter/tables/data/_ignore_base.yml | 111 +++------------------ 2 files changed, 32 insertions(+), 127 deletions(-) diff --git a/scriptshifter/tables/__init__.py b/scriptshifter/tables/__init__.py index a2098c9..725192f 100644 --- a/scriptshifter/tables/__init__.py +++ b/scriptshifter/tables/__init__.py @@ -1,5 +1,4 @@ import logging -import re import sqlite3 from collections import defaultdict @@ -7,6 +6,7 @@ from importlib import import_module from json import dumps as jdumps, loads as jloads from os import R_OK, access, environ, makedirs, path, unlink +from re import compile from shutil import move from yaml import load @@ -247,20 +247,19 @@ def populate_table(conn, tid, tname): hook_data[1].__name__, jdumps(hook_data[2]))) # Ignore rules (R2S only). - for row in sec.get("ignore", []): - if isinstance(row, dict): - if "re" in row: - flags = FEAT_RE - rule = row["re"] - else: - flags = 0 - rule = row + for rule in sec.get("ignore", []): + conn.execute( + """INSERT INTO tbl_ignore ( + lang_id, rule, features + ) VALUES (?, ?, ?)""", + (tid, rule, 0)) + for rule in sec.get("ignore_ptn", []): conn.execute( """INSERT INTO tbl_ignore ( lang_id, rule, features ) VALUES (?, ?, ?)""", - (tid, rule, flags)) + (tid, rule, FEAT_RE)) # Double caps (S2R only). for rule in sec.get("double_cap", []): @@ -417,33 +416,22 @@ def load_table(tname): # Ignore regular expression patterns. # Patterns are evaluated in the order they are listed in the config. - ignore_ptn = [ - re.compile(ptn) - for ptn in tdata["roman_to_script"].get("ignore_ptn", [])] + ignore_ptn = tdata["roman_to_script"].get("ignore_ptn", []) for parent in parents: parent_tdata = load_table(parent) # NOTE: duplicates are not removed. 
- ignore_ptn = [ - re.compile(ptn) - for ptn in parent_tdata.get( - "roman_to_script", {}).get("ignore_ptn", []) - ] + ignore_ptn + ignore_ptn = parent_tdata.get( + "roman_to_script", {}).get("ignore_ptn", []) + ignore_ptn tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn # Ignore plain strings. - ignore = { - Token(t) - for t in tdata["roman_to_script"].get("ignore", []) - } + ignore = set(tdata["roman_to_script"].get("ignore", [])) for parent in parents: parent_tdata = load_table(parent) # No overriding occurs with the ignore list, only de-duplication. - ignore |= { - Token(t) for t in parent_tdata.get( - "roman_to_script", {}).get("ignore", []) - } - tdata["roman_to_script"]["ignore"] = [ - t.content for t in sorted(ignore)] + ignore |= set(parent_tdata.get( + "roman_to_script", {}).get("ignore", [])) + tdata["roman_to_script"]["ignore"] = sorted(ignore) # Hooks. if "hooks" in tdata["roman_to_script"]: @@ -592,7 +580,9 @@ def get_lang_ignore(conn, lang_id): WHERE lang_id = ?""", (lang_id,)) # Features (regular expressions) not implemented yet. - return tuple(row[0] for row in qry) + return tuple( + compile(row[0]) if row[1] & FEAT_RE else row[0] + for row in qry) @cache diff --git a/scriptshifter/tables/data/_ignore_base.yml b/scriptshifter/tables/data/_ignore_base.yml index 4937783..da083ac 100644 --- a/scriptshifter/tables/data/_ignore_base.yml +++ b/scriptshifter/tables/data/_ignore_base.yml @@ -16,106 +16,21 @@ roman_to_script: # dedicated U+2160÷U+216F (uppercase Roman # numerals) and/or U+2170÷U+217F (lower case Roman # numerals) ranges to avoid this ambiguity. - # TODO implement regular expressions for ignore patterns. 
- #- re: "I{2,3}" - #- re: "I(V|X)" - #- re: "LI{,3}" - #- re: "LI?(V|X)" - #- re: "L(V|X{1,3})I{,3}" - #- re: "LX{1,3}I?V" - #- re: "LX{1,3}VI{,3}" - #- re: "(V|X{1,3})I{,3}" - #- re: "X{1,3}I{,3}" - #- re: "X{1,3}I(V|X)" - #- re: "X{1,3}VI{,3}" - - "II" - - "III" - - "IV" - - "IX" - - "LI" - - "LII" - - "LIII" - - "LIV" - - "LIX" - - "LV" - - "LVI" - - "LVII" - - "LVIII" - - "LX" - - "LXI" - - "LXII" - - "LXIII" - - "LXIV" - - "LXIX" - - "LXV" - - "LXVI" - - "LXVII" - - "LXVIII" - - "LXX" - - "LXXI" - - "LXXII" - - "LXXIII" - - "LXXIV" - - "LXXIX" - - "LXXV" - - "LXXVI" - - "LXXVII" - - "LXXVIII" - - "LXXX" - - "LXXXI" - - "LXXXII" - - "LXXXIII" - - "LXXXIV" - - "LXXXIX" - - "LXXXV" - - "LXXXVI" - - "LXXXVII" - - "LXXXVIII" - - "VI" - - "VII" - - "VIII" - - "XI" - - "XII" - - "XIII" - - "XIV" - - "XIX" - - "XL" - - "XLI" - - "XLII" - - "XLIII" - - "XLIV" - - "XLIX" - - "XLV" - - "XLVI" - - "XLVII" - - "XLVIII" - - "XV" - - "XVI" - - "XVII" - - "XVIII" - - "XX" - - "XXI" - - "XXII" - - "XXIII" - - "XXIV" - - "XXIX" - - "XXV" - - "XXVI" - - "XXVII" - - "XXVIII" - - "XXX" - - "XXXI" - - "XXXII" - - "XXXIII" - - "XXXIV" - - "XXXIX" - - "XXXV" - - "XXXVI" - - "XXXVII" - - "XXXVIII" - "and one other" - #- re: "and ([a-z0-9]+ )?others" - "et al." + ignore_ptn: + - "and ([a-z0-9]+ )?others" + - "I{2,3}" + - "I(V|X)" + - "LI{,3}" + - "LI?(V|X)" + - "L(V|X{1,3})I{,3}" + - "LX{1,3}I?V" + - "LX{1,3}VI{,3}" + - "(V|X{1,3})I{,3}" + - "X{1,3}I{,3}" + - "X{1,3}I(V|X)" + - "X{1,3}VI{,3}" script_to_roman: ignore: From 6c5cab47431440099b9c6981e4f3a4acca489741 Mon Sep 17 00:00:00 2001 From: scossu Date: Sat, 16 Nov 2024 17:08:12 -0500 Subject: [PATCH 2/6] WIP regexp and testing framework. 
--- example.env | 2 + legacy/processNumbers.ts | 144 ++++++++++++++++++++++ scriptshifter/tables/__init__.py | 40 +++--- scriptshifter/tables/{data => }/index.yml | 0 tests/__init__.py | 22 +++- tests/data/{ => config}/_base1.yml | 0 tests/data/{ => config}/_base2.yml | 0 tests/data/{ => config}/_base3.yml | 0 tests/data/{ => config}/cap_base1.yml | 0 tests/data/{ => config}/cap_base2.yml | 0 tests/data/{ => config}/cap_inherited.yml | 0 tests/data/{ => config}/index.yml | 0 tests/data/{ => config}/inherited.yml | 0 tests/data/{ => config}/ordering.yml | 0 tests/data/{ => config}/rot3.yml | 0 tests/data/script_samples/unittest.csv | 9 ++ tests/test02_transliteration.py | 39 +++--- 17 files changed, 217 insertions(+), 39 deletions(-) create mode 100644 legacy/processNumbers.ts rename scriptshifter/tables/{data => }/index.yml (100%) rename tests/data/{ => config}/_base1.yml (100%) rename tests/data/{ => config}/_base2.yml (100%) rename tests/data/{ => config}/_base3.yml (100%) rename tests/data/{ => config}/cap_base1.yml (100%) rename tests/data/{ => config}/cap_base2.yml (100%) rename tests/data/{ => config}/cap_inherited.yml (100%) rename tests/data/{ => config}/index.yml (100%) rename tests/data/{ => config}/inherited.yml (100%) rename tests/data/{ => config}/ordering.yml (100%) rename tests/data/{ => config}/rot3.yml (100%) create mode 100644 tests/data/script_samples/unittest.csv diff --git a/example.env b/example.env index 004c0d4..5e30eb8 100644 --- a/example.env +++ b/example.env @@ -2,4 +2,6 @@ FLASK_DEBUG=true TXL_DICTA_EP="changeme" TXL_FLASK_SECRET="changeme" TXL_LOGLEVEL="INFO" +TXL_EMAIL_FROM="me@loc.gov" +TXL_EMAIL_TO="me@loc.gov" LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv" diff --git a/legacy/processNumbers.ts b/legacy/processNumbers.ts new file mode 100644 index 0000000..691e5ab --- /dev/null +++ b/legacy/processNumbers.ts @@ -0,0 +1,144 @@ +private processNumbers(pinyinString: string, tag: string, code: 
string): string { + let outputString = ""; + let useNumVersion = false; + //useNumVersion is set in specific subfields where we definitely want to treat numbers as numbers + if ((tag == "245" || tag == "830") && code == "n") { + useNumVersion = true; + } + + /* + * The input string is split, with any space or punctuation character (except for #) as the delimiter. + * The delimiters will be captured and included in the string of tokens. Only the even-numbered + * array elements are the true 'tokens', so the code for processing tokens is run only for even + * values of j. + */ + let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u")); + let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$"; + let numToken_re = new RegExp(numTokenPattern); + let n = tokens.length + //this.alert.info(tokens.join("|"),{autoClose: false}) + for (let i = 0; i < n; i++) { + let toki = tokens[i]; + if (toki.match(numToken_re)) { + /* + * When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive numerical tokens + * found after it. Two versions of the string are maintained. The textVersion is the original pinyin (minus the + * # suffixes). In the numVersion, characters representing numbers are converted to Arabic numerals. When a + * non-numerical token (or end of string) is encountered, the string of numerical tokens is evaluated to determine + * which version should be used in the output string. The outer loop then continues where the inner loop left off. + */ + let textVersion = ""; + let numVersion = ""; + for (let j = i; j < n; j++) { + let tokj = tokens[j]; + /* a token without # (or the end of string) is reached */ + if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) { + //If this runs, then we are on the last token and it is numeric. 
Add text after # (if present) to numerical version + let m = tokj.match(numToken_re); + if (m) { + textVersion += m[1] + if (m[2] == "") { + numVersion += m[1]; + } else { + numVersion += m[2]; + } + } else if (j == n - 1) { + //if last token is non-numerical, just tack it on. + textVersion += tokj; + numVersion += tokj; + } else if (textVersion.length > 0 && numVersion.length > 0) { + //if not at end of string yet and token is non-numerical, remove the last delimiter that was appended + //(outer loop will pick up at this point) + textVersion = textVersion.substring(0, textVersion.length - 1); + numVersion = numVersion.substring(0, numVersion.length - 1); + } + //evaluate numerical string that has been constructed so far + //use num version for ordinals and date strings + if (numVersion.match(/^di [0-9]/i) || + numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) || + numVersion.match(/[0-9]+ nian [0-9]+ yue/i) || + numVersion.match(/"[0-9]+ yue [0-9]+ ri/i) || + useNumVersion + ) { + useNumVersion = true; + /* + * At this point, string may contain literal translations of Chinese numerals + * Convert these to Arabic numerals (for example "2 10 7" = "27"). 
+ */ + + while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) { + m = numVersion.match(/([0-9]+) ([1-9]0+)/); + if (m) { + let sum = Number(m[1]) * Number(m[2]); + numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum)); + } else { + let mb = numVersion.match(/([1-9]0+) ([0-9]+)/); + if (mb) + { + let sumb = Number(mb[1]) + Number(mb[2]); + numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb)); + } + else + { + break; + } + } + } + + //A few other tweaks + numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4"); + if ((tag == "245" || tag == "830") && code == "n") { + while (numVersion.match(/[0-9] [0-9]/)) { + numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2"); + } + } + } + if (useNumVersion) + { + outputString += numVersion; + } + else + { + outputString += textVersion; + } + //if the end of the string is not reached, backtrack to the delimiter after the last numerical token + //(i.e. two tokens ago) + if (j < n - 1) + { + i = j - 2; + } + else //we are at the end of the string, so we are done! + { + i = j; + } + break; + } + //this is run when we are not yet at the end of the string and have not yet reached a non-numerical token + //This is identical to the code that is run above when the last token is numeric. + if (j % 2 == 0) + { + let m = tokj.match(numToken_re); + textVersion += m[1]; + if (m[2]== "") + { + numVersion += m[1]; + } + else + { + numVersion += m[2]; + } + } + else //a delimiter, just tack it on. + { + textVersion += tokj; + numVersion += tokj; + } + } + } + else // the outer loop has encountered a non-numeric token or delimiter, just tack it on. 
+ { + outputString += toki; + } + } + return outputString; + } diff --git a/scriptshifter/tables/__init__.py b/scriptshifter/tables/__init__.py index 725192f..0b576ca 100644 --- a/scriptshifter/tables/__init__.py +++ b/scriptshifter/tables/__init__.py @@ -143,7 +143,7 @@ def init_db(): This operation removes any preexisting database. - All tables in the index file (`./data/index.yml`) will be parsed + All tables in the index file (`./index.yml`) will be parsed (including inheritance rules) and loaded into the designated DB. This must be done only once at bootstrap. To update individual tables, @@ -151,7 +151,7 @@ def init_db(): """ # Create parent diretories if necessary. # If the DB already exists, it will be overwritten ONLY on success at - # hhis point. + # this point. if path.isfile(TMP_DB_PATH): # Remove previous temp file (possibly from failed attempt) unlink(TMP_DB_PATH) @@ -166,21 +166,12 @@ def init_db(): conn.executescript(fh.read()) # Populate tables. - with open(path.join(TABLE_DIR, "index.yml")) as fh: + with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh: tlist = load(fh, Loader=Loader) try: with conn: for tname, tdata in tlist.items(): - res = conn.execute( - """INSERT INTO tbl_language ( - name, label, marc_code, description - ) VALUES (?, ?, ?, ?)""", - ( - tname, tdata.get("name"), tdata.get("marc_code"), - tdata.get("description"), - ) - ) - populate_table(conn, res.lastrowid, tname) + populate_table(conn, tname, tdata) # If the DB already exists, it will be overwritten ONLY on success at # thhis point. @@ -201,7 +192,27 @@ def get_connection(): return sqlite3.connect(DB_PATH) -def populate_table(conn, tid, tname): +def populate_table(conn, tname, tdata): + """ + Populate an individual table with data from a configuration. + + @param conn: SQLite connection. + + @param tname(str): Table name. + + @param tdata(dict): Table data. 
+ """ + res = conn.execute( + """INSERT INTO tbl_language ( + name, label, marc_code, description + ) VALUES (?, ?, ?, ?)""", + ( + tname, tdata.get("name"), tdata.get("marc_code"), + tdata.get("description"), + ) + ) + tid = res.lastrowid + data = load_table(tname) flags = 0 if "script_to_roman" in data: @@ -579,7 +590,6 @@ def get_lang_ignore(conn, lang_id): """SELECT rule, features FROM tbl_ignore WHERE lang_id = ?""", (lang_id,)) - # Features (regular expressions) not implemented yet. return tuple( compile(row[0]) if row[1] & FEAT_RE else row[0] for row in qry) diff --git a/scriptshifter/tables/data/index.yml b/scriptshifter/tables/index.yml similarity index 100% rename from scriptshifter/tables/data/index.yml rename to scriptshifter/tables/index.yml diff --git a/tests/__init__.py b/tests/__init__.py index e4cde3e..50725a7 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,11 +1,10 @@ from csv import reader from difflib import ndiff +from glob import glob from importlib import reload from json import loads as jloads from logging import getLogger -from os import path - -import scriptshifter.tables +from os import environ, path from scriptshifter.trans import transliterate @@ -17,8 +16,20 @@ def reload_tables(): - reload(scriptshifter.tables) # Reload new config dir. + if "TXL_CONFIG_TABLE_DIR" in environ: + del environ["TXL_CONFIG_TABLE_DIR"] + + # import here to set modified test config dir. 
from scriptshifter import tables + + tables.init_db() + + for fname in glob(path.join(TEST_DATA_DIR, "config", ".yml")): + tname = path.splitext(path.basename(filename))[1] + with tables.get_connection() as conn: + tables.populate_table(conn, tname, {"name": fname}) + + tables.list_tables.cache_clear() tables.get_language.cache_clear() tables.get_lang_map.cache_clear() @@ -41,7 +52,10 @@ def test_sample(dset): with open(dset_fpath, newline="") as fh: csv = reader(fh) + i = 1 for row in csv: + logger.info(f"CSV row #{i}") + i += 1 lang, script, rom = row[:3] if not lang: continue diff --git a/tests/data/_base1.yml b/tests/data/config/_base1.yml similarity index 100% rename from tests/data/_base1.yml rename to tests/data/config/_base1.yml diff --git a/tests/data/_base2.yml b/tests/data/config/_base2.yml similarity index 100% rename from tests/data/_base2.yml rename to tests/data/config/_base2.yml diff --git a/tests/data/_base3.yml b/tests/data/config/_base3.yml similarity index 100% rename from tests/data/_base3.yml rename to tests/data/config/_base3.yml diff --git a/tests/data/cap_base1.yml b/tests/data/config/cap_base1.yml similarity index 100% rename from tests/data/cap_base1.yml rename to tests/data/config/cap_base1.yml diff --git a/tests/data/cap_base2.yml b/tests/data/config/cap_base2.yml similarity index 100% rename from tests/data/cap_base2.yml rename to tests/data/config/cap_base2.yml diff --git a/tests/data/cap_inherited.yml b/tests/data/config/cap_inherited.yml similarity index 100% rename from tests/data/cap_inherited.yml rename to tests/data/config/cap_inherited.yml diff --git a/tests/data/index.yml b/tests/data/config/index.yml similarity index 100% rename from tests/data/index.yml rename to tests/data/config/index.yml diff --git a/tests/data/inherited.yml b/tests/data/config/inherited.yml similarity index 100% rename from tests/data/inherited.yml rename to tests/data/config/inherited.yml diff --git a/tests/data/ordering.yml 
b/tests/data/config/ordering.yml similarity index 100% rename from tests/data/ordering.yml rename to tests/data/config/ordering.yml diff --git a/tests/data/rot3.yml b/tests/data/config/rot3.yml similarity index 100% rename from tests/data/rot3.yml rename to tests/data/config/rot3.yml diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv new file mode 100644 index 0000000..432d293 --- /dev/null +++ b/tests/data/script_samples/unittest.csv @@ -0,0 +1,9 @@ +chinese,從易經解維摩詰經,臺北市大塊文化出版股份有限公司。,"cong yi jing jie wei mo jie jing, Taibei Shi da kuai wen hua chu ban gu fen you xian gong si.",, +chinese,廖忠俊. 著名狀元榜眼探花傳略,liao zhong jun. zhu ming zhuang yuan bang yan tan hua zhuan lüe,, +chinese,文學革命論 / 陳獨秀 -- 人的文學 / 周作人 -- 新文學運動的意義 / 張我軍.,wen xue ge ming lun / chen du xiu -- ren de wen xue / zhou zuo ren -- xin wen xue yun dong de yi yi / zhang wo jun.,, +belarusian,Пётр Клімук : жыццё і подзвіг касманаўта,Pi︠o︡tr Klimuk : z︠h︡ytstsi︠o︡ i podzvih kasmanaŭta,, +greek_classical,Ἡσιόδου τοῦ Ἀσκραίου Ἔργα καὶ ἡμέραι,Hēsiodou tou Askraiou Erga kai hēmerai,, +korean_names,간규찬,Kan Kyu-ch'an,,Hangul; from Y. Lee, +korean_names,강감찬,Kang Kam-ch'an,,Hangul; from Y. Lee, +korean_nonames,내 나름 대로 의 사랑,Nae narŭm taero ŭi sarang,"{""capitalize"": ""first""}",From K-Romanizer, +korean_nonames,내 마음 속 의 한국 문학,Nae maŭm sok ŭi Han'guk munhak,"{""capitalize"": ""first""}",From K-Romanizer, diff --git a/tests/test02_transliteration.py b/tests/test02_transliteration.py index 9767ee9..03c4744 100644 --- a/tests/test02_transliteration.py +++ b/tests/test02_transliteration.py @@ -8,7 +8,7 @@ from tests import TEST_DATA_DIR, reload_tables from scriptshifter.trans import transliterate -import scriptshifter.tables +from scriptshifter.tables import get_language logger = logging.getLogger(__name__) @@ -33,8 +33,8 @@ def sample_s2r(self): This function name won't start with `test_` otherwise will be automatically run without parameters. 
""" - config = scriptshifter.tables.load_table(self.tbl) - if "script_to_roman" in config: + config = get_language(self.tbl) + if config["has_s2r"]: txl = transliterate( self.script, self.tbl, capitalize=self.options.get("capitalize", False), @@ -51,8 +51,8 @@ def sample_r2s(self): This function name won't start with `test_` otherwise will be automatically run without parameters. """ - config = scriptshifter.tables.load_table(self.tbl) - if "roman_to_script" in config: + config = get_language(self.tbl) + if config["has_r2s"]: txl = transliterate( self.roman, self.tbl, t_dir="r2s", @@ -68,25 +68,24 @@ def make_suite(): """ Build parametrized test cases. """ - if "TXL_CONFIG_TABLE_DIR" in environ: - del environ["TXL_CONFIG_TABLE_DIR"] reload_tables() suite = TestSuite() - for fpath in glob(path.join(TEST_DATA_DIR, "script_samples", "*.csv")): - with open(fpath, newline="") as fh: - csv = reader(fh) - for row in csv: - if len(row[0]): - # Inject transliteration info in the test case. - for tname in ("sample_s2r", "sample_r2s"): - tcase = TestTrans(tname) - tcase.tbl = row[0] - tcase.script = row[1].strip() - tcase.roman = row[2].strip() - tcase.options = jloads(row[3]) if len(row[3]) else {} - suite.addTest(tcase) + with open(path.join( + TEST_DATA_DIR, "script_samples", "unittest.csv" + ), newline="") as fh: + csv = reader(fh) + for row in csv: + if len(row[0]): + # Inject transliteration info in the test case. + for tname in ("sample_s2r", "sample_r2s"): + tcase = TestTrans(tname) + tcase.tbl = row[0] + tcase.script = row[1].strip() + tcase.roman = row[2].strip() + tcase.options = jloads(row[3]) if len(row[3]) else {} + suite.addTest(tcase) return suite From 90b9f4c1f1540f7de9584224b54d78e75bcd617d Mon Sep 17 00:00:00 2001 From: scossu Date: Sat, 16 Nov 2024 19:41:19 -0500 Subject: [PATCH 3/6] Do unit test properly. 
--- doc/rest_api.md | 2 +- scriptshifter/__init__.py | 2 +- scriptshifter/tables/__init__.py | 16 ++-- tests/__init__.py | 88 +++---------------- tests/data/config/index.yml | 6 -- tests/data/script_samples/unittest.csv | 16 ++-- tests/integration_tests.py | 58 ++++++++++++ tests/{data/config => tables/data}/_base1.yml | 0 tests/{data/config => tables/data}/_base2.yml | 0 tests/{data/config => tables/data}/_base3.yml | 0 .../config => tables/data}/cap_base1.yml | 0 .../config => tables/data}/cap_base2.yml | 0 .../config => tables/data}/cap_inherited.yml | 0 .../config => tables/data}/inherited.yml | 0 .../{data/config => tables/data}/ordering.yml | 0 tests/{data/config => tables/data}/rot3.yml | 0 tests/tables/index.yml | 14 +++ tests/test01_cfg.py | 63 +++++++------ tests/test02_transliteration.py | 16 ++-- tests/test03_capitalization.py | 17 ++-- tests/test04_rest_api.py | 36 ++++---- 21 files changed, 171 insertions(+), 163 deletions(-) delete mode 100644 tests/data/config/index.yml create mode 100644 tests/integration_tests.py rename tests/{data/config => tables/data}/_base1.yml (100%) rename tests/{data/config => tables/data}/_base2.yml (100%) rename tests/{data/config => tables/data}/_base3.yml (100%) rename tests/{data/config => tables/data}/cap_base1.yml (100%) rename tests/{data/config => tables/data}/cap_base2.yml (100%) rename tests/{data/config => tables/data}/cap_inherited.yml (100%) rename tests/{data/config => tables/data}/inherited.yml (100%) rename tests/{data/config => tables/data}/ordering.yml (100%) rename tests/{data/config => tables/data}/rot3.yml (100%) create mode 100644 tests/tables/index.yml diff --git a/doc/rest_api.md b/doc/rest_api.md index b4712c3..94bf4bb 100644 --- a/doc/rest_api.md +++ b/doc/rest_api.md @@ -73,7 +73,7 @@ MIME type: `application/json` Content: JSON object with the following keys: -- `lang`: Language code as given by the `/languages` endpoint. +- `lang`: Language code as given by the `/languages` endpoint. 
- `text`: Input text to be transliterated. - `capitalize`: One of `first` (capitalize the first letter of the input), `all` (capitalize all words separated by spaces), or null (default: apply no diff --git a/scriptshifter/__init__.py b/scriptshifter/__init__.py index e9a4e6f..d6adb57 100644 --- a/scriptshifter/__init__.py +++ b/scriptshifter/__init__.py @@ -15,7 +15,7 @@ This DB stores all the runtime transliteration data. """ DB_PATH = environ.get( - "DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db")) + "TXL_DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db")) """ SMTP server for sending email. For a dummy server that just echoes the diff --git a/scriptshifter/tables/__init__.py b/scriptshifter/tables/__init__.py index 0b576ca..ee6126d 100644 --- a/scriptshifter/tables/__init__.py +++ b/scriptshifter/tables/__init__.py @@ -28,9 +28,6 @@ """ -TMP_DB_PATH = path.join( - path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH)) - DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data") # Can be overridden for tests. TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR) @@ -152,6 +149,8 @@ def init_db(): # Create parent diretories if necessary. # If the DB already exists, it will be overwritten ONLY on success at # this point. + TMP_DB_PATH = path.join( + path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH)) if path.isfile(TMP_DB_PATH): # Remove previous temp file (possibly from failed attempt) unlink(TMP_DB_PATH) @@ -176,6 +175,7 @@ def init_db(): # If the DB already exists, it will be overwritten ONLY on success at # thhis point. 
move(TMP_DB_PATH, DB_PATH) + logger.info(f"Database initialized at {DB_PATH}.") finally: conn.close() if path.isfile(TMP_DB_PATH): @@ -520,6 +520,10 @@ def get_language(lang): if len(s2r_hooks): data["script_to_roman"]["hooks"] = s2r_hooks + double_cap = get_lang_dcap(conn, lang_id) + if len(double_cap): + data["script_to_roman"]["double_cap"] = double_cap + # Roman to script map, ignore list, and hooks. if data["has_r2s"]: @@ -541,10 +545,6 @@ def get_language(lang): if len(opt_data): data["options"] = opt_data - double_cap = get_lang_dcap(conn, lang_id) - if len(double_cap): - data["double_cap"] = double_cap - conn.close() return data @@ -652,7 +652,7 @@ def get_lang_hooks(conn, lang_id, t_dir): } ) - return hooks + return dict(hooks) def get_lang_dcap(conn, lang_id): diff --git a/tests/__init__.py b/tests/__init__.py index 50725a7..4d43854 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,85 +1,17 @@ -from csv import reader -from difflib import ndiff -from glob import glob from importlib import reload -from json import loads as jloads -from logging import getLogger -from os import environ, path +from os import path, environ +from tempfile import gettempdir -from scriptshifter.trans import transliterate +import scriptshifter +from scriptshifter import tables TEST_DIR = path.dirname(path.realpath(__file__)) TEST_DATA_DIR = path.join(TEST_DIR, "data") +TEST_CONFIG_DIR = path.join(TEST_DIR, "tables", "data") -logger = getLogger(__name__) - - -def reload_tables(): - if "TXL_CONFIG_TABLE_DIR" in environ: - del environ["TXL_CONFIG_TABLE_DIR"] - - # import here to set modified test config dir. 
- from scriptshifter import tables - - tables.init_db() - - for fname in glob(path.join(TEST_DATA_DIR, "config", ".yml")): - tname = path.splitext(path.basename(filename))[1] - with tables.get_connection() as conn: - tables.populate_table(conn, tname, {"name": fname}) - - - tables.list_tables.cache_clear() - tables.get_language.cache_clear() - tables.get_lang_map.cache_clear() - - return tables - - -def test_sample(dset): - """ - Test an individual sample set and produce a human-readable report. - - Used outside of automated tests. - - @param dset (str): sample set name (without the .csv extension) found in - the `data/script_samples` directory. - """ - deltas = [] - dset_fpath = path.join(TEST_DATA_DIR, "script_samples", dset + ".csv") - log_fpath = path.join(TEST_DATA_DIR, f"test_{dset}.log") - - with open(dset_fpath, newline="") as fh: - csv = reader(fh) - i = 1 - for row in csv: - logger.info(f"CSV row #{i}") - i += 1 - lang, script, rom = row[:3] - if not lang: - continue - opts = jloads(row[3]) if len(row) > 3 and row[3] else {} - trans, warnings = transliterate( - script, lang, t_dir="s2r", - capitalize=opts.get("capitalize"), options=opts) - if (trans == rom): - print(".", end="") - else: - print("F", end="") - deltas.append((lang, script, ndiff([trans], [rom]))) - - with open(log_fpath, "w") as fh: - # If no deltas, just truncate the file. - for lang, script, delta in deltas: - fh.write(f"Language: {lang}\n") - fh.write(f"Original: {script}\nDiff (result vs. expected):\n") - for dline in delta: - fh.write(dline.strip() + "\n") - fh.write("\n\n") - - ct = len(deltas) - if ct > 0: - print(f"{ct} failed tests. See report at {log_fpath}") - else: - print("All tests passed.") +# Reload main SS modules after changing environment variables. 
+environ["TXL_DB_PATH"] = path.join(gettempdir(), "scriptshifter_unittest.db") +reload(scriptshifter) +environ["TXL_CONFIG_TABLE_DIR"] = TEST_CONFIG_DIR +reload(tables) diff --git a/tests/data/config/index.yml b/tests/data/config/index.yml deleted file mode 100644 index 489e169..0000000 --- a/tests/data/config/index.yml +++ /dev/null @@ -1,6 +0,0 @@ -inherited: - name: Test inherited table -ordering: - name: Test ordering -rot3: - name: Test ROT3 hooks diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv index 432d293..0113c11 100644 --- a/tests/data/script_samples/unittest.csv +++ b/tests/data/script_samples/unittest.csv @@ -1,9 +1,7 @@ -chinese,從易經解維摩詰經,臺北市大塊文化出版股份有限公司。,"cong yi jing jie wei mo jie jing, Taibei Shi da kuai wen hua chu ban gu fen you xian gong si.",, -chinese,廖忠俊. 著名狀元榜眼探花傳略,liao zhong jun. zhu ming zhuang yuan bang yan tan hua zhuan lüe,, -chinese,文學革命論 / 陳獨秀 -- 人的文學 / 周作人 -- 新文學運動的意義 / 張我軍.,wen xue ge ming lun / chen du xiu -- ren de wen xue / zhou zuo ren -- xin wen xue yun dong de yi yi / zhang wo jun.,, -belarusian,Пётр Клімук : жыццё і подзвіг касманаўта,Pi︠o︡tr Klimuk : z︠h︡ytstsi︠o︡ i podzvih kasmanaŭta,, -greek_classical,Ἡσιόδου τοῦ Ἀσκραίου Ἔργα καὶ ἡμέραι,Hēsiodou tou Askraiou Erga kai hēmerai,, -korean_names,간규찬,Kan Kyu-ch'an,,Hangul; from Y. Lee, -korean_names,강감찬,Kang Kam-ch'an,,Hangul; from Y. 
Lee, -korean_nonames,내 나름 대로 의 사랑,Nae narŭm taero ŭi sarang,"{""capitalize"": ""first""}",From K-Romanizer, -korean_nonames,내 마음 속 의 한국 문학,Nae maŭm sok ŭi Han'guk munhak,"{""capitalize"": ""first""}",From K-Romanizer, +inherited,abcd,ABCD,, +inherited,ABCD,abcd,"{""dir"": ""r2s""}", +inherited,ab,90,, +rot3,abcd,defg,, +rot3,HIJK,KLMN,, +rot3,pqrs,Pqrs,"{""capitalize"": ""first""}", +rot3,pqrs,PQRS,"{""capitalize"": ""all""}", diff --git a/tests/integration_tests.py b/tests/integration_tests.py new file mode 100644 index 0000000..e079664 --- /dev/null +++ b/tests/integration_tests.py @@ -0,0 +1,58 @@ +from csv import reader +from difflib import ndiff +from json import loads as jloads +from logging import getLogger +from os import path + +from scriptshifter.trans import transliterate +from tests import TEST_DATA_DIR + +logger = getLogger(__name__) + + +def test_sample(dset): + """ + Test an individual sample set and produce a human-readable report. + + Used outside of automated tests. + + @param dset (str): sample set name (without the .csv extension) found in + the `data/script_samples` directory. + """ + deltas = [] + dset_fpath = path.join(TEST_DATA_DIR, "script_samples", dset + ".csv") + log_fpath = path.join(TEST_DATA_DIR, f"test_{dset}.log") + + with open(dset_fpath, newline="") as fh: + csv = reader(fh) + i = 1 + for row in csv: + logger.info(f"CSV row #{i}") + i += 1 + lang, script, rom = row[:3] + if not lang: + continue + opts = jloads(row[3]) if len(row) > 3 and row[3] else {} + trans, warnings = transliterate( + script, lang, t_dir="s2r", + capitalize=opts.get("capitalize"), options=opts) + if (trans == rom): + print(".", end="") + else: + print("F", end="") + deltas.append((lang, script, ndiff([trans], [rom]))) + + with open(log_fpath, "w") as fh: + # If no deltas, just truncate the file. + for lang, script, delta in deltas: + fh.write(f"Language: {lang}\n") + fh.write(f"Original: {script}\nDiff (result vs. 
expected):\n") + for dline in delta: + fh.write(dline.strip() + "\n") + fh.write("\n\n") + + ct = len(deltas) + if ct > 0: + print(f"{ct} failed tests. See report at {log_fpath}") + else: + print("All tests passed.") diff --git a/tests/data/config/_base1.yml b/tests/tables/data/_base1.yml similarity index 100% rename from tests/data/config/_base1.yml rename to tests/tables/data/_base1.yml diff --git a/tests/data/config/_base2.yml b/tests/tables/data/_base2.yml similarity index 100% rename from tests/data/config/_base2.yml rename to tests/tables/data/_base2.yml diff --git a/tests/data/config/_base3.yml b/tests/tables/data/_base3.yml similarity index 100% rename from tests/data/config/_base3.yml rename to tests/tables/data/_base3.yml diff --git a/tests/data/config/cap_base1.yml b/tests/tables/data/cap_base1.yml similarity index 100% rename from tests/data/config/cap_base1.yml rename to tests/tables/data/cap_base1.yml diff --git a/tests/data/config/cap_base2.yml b/tests/tables/data/cap_base2.yml similarity index 100% rename from tests/data/config/cap_base2.yml rename to tests/tables/data/cap_base2.yml diff --git a/tests/data/config/cap_inherited.yml b/tests/tables/data/cap_inherited.yml similarity index 100% rename from tests/data/config/cap_inherited.yml rename to tests/tables/data/cap_inherited.yml diff --git a/tests/data/config/inherited.yml b/tests/tables/data/inherited.yml similarity index 100% rename from tests/data/config/inherited.yml rename to tests/tables/data/inherited.yml diff --git a/tests/data/config/ordering.yml b/tests/tables/data/ordering.yml similarity index 100% rename from tests/data/config/ordering.yml rename to tests/tables/data/ordering.yml diff --git a/tests/data/config/rot3.yml b/tests/tables/data/rot3.yml similarity index 100% rename from tests/data/config/rot3.yml rename to tests/tables/data/rot3.yml diff --git a/tests/tables/index.yml b/tests/tables/index.yml new file mode 100644 index 0000000..c45f9eb --- /dev/null +++ 
b/tests/tables/index.yml @@ -0,0 +1,14 @@ +inherited: + name: Test inheritance leaf file + marc_code: inh + description: Test description. +cap_base1: + name: Test capitalization base 1 +cap_base2: + name: Test capitalization base 2 +cap_inherited: + name: Test capitalization +ordering: + name: Test ordering +rot3: + name: Test ROT3 hooks diff --git a/tests/test01_cfg.py b/tests/test01_cfg.py index c861d91..6b7466f 100644 --- a/tests/test01_cfg.py +++ b/tests/test01_cfg.py @@ -1,20 +1,22 @@ +from os import environ, unlink from unittest import TestCase -from os import environ +from scriptshifter.tables import get_language -import scriptshifter -from tests import TEST_DATA_DIR, reload_tables +def setUpModule(): + from scriptshifter.tables import init_db + init_db() + + +def tearDownModule(): + unlink(environ["TXL_DB_PATH"]) class TestConfig(TestCase): """ Test configuration parsing. """ - def setUp(self): - environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR - self.tables = reload_tables() - def test_ordering(self): - tbl = self.tables.load_table("ordering") + tbl = get_language("ordering") exp_order = ["ABCD", "AB", "A", "BCDE", "BCD", "BEFGH", "B"] self.assertEqual( @@ -23,19 +25,17 @@ def test_ordering(self): class TestOverride(TestCase): """ Test configuration overrides. """ - def setUp(self): - environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR - self.tables = reload_tables() - def test_override_map(self): - tbl = self.tables.load_table("inherited") + tbl = get_language("inherited") - self.assertEqual(tbl["general"]["name"], "Test inheritance leaf file") + self.assertEqual(tbl["label"], "Test inheritance leaf file") + self.assertEqual(tbl["marc_code"], "inh") + self.assertEqual(tbl["description"], "Test description.") # Entries are additive. 
self.assertEqual( tbl["roman_to_script"]["ignore"], - ["Fritter my wig", "Hi", "Ho", "Thing-um-a-jig"]) + ("Fritter my wig", "Hi", "Ho", "Thing-um-a-jig")) self.assertEqual( tbl["roman_to_script"]["map"], ( @@ -102,34 +102,31 @@ def test_override_map(self): class TestHooks(TestCase): """ Test parsing of hook functions. """ - def setUp(self): - environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR - self.tables = reload_tables() - def test_rot3(self): - tbl = self.tables.load_table("rot3") + tbl = get_language("rot3") self.assertEqual( - tbl["script_to_roman"]["hooks"], - { - "begin_input_token": [ - ("test", scriptshifter.hooks.test.rotate, {"n": -3}) - ] - }) + tbl["script_to_roman"]["hooks"], + { + "begin_input_token": [ + { + "module_name": "test", + "fn_name": "rotate", + "kwargs": {"n": -3}, + } + ] + } + ) class TestDoubleCaps(TestCase): """ Test double capitalization configuration. """ - def setUp(self): - environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR - self.tables = reload_tables() - def test_dcaps_base1(self): - cap_base1 = self.tables.load_table("cap_base1") + cap_base1 = get_language("cap_base1") assert "z︠h︡" in cap_base1["script_to_roman"]["double_cap"] def test_dcaps_base2(self): - cap_base2 = self.tables.load_table("cap_base2") + cap_base2 = get_language("cap_base2") dcap = cap_base2["script_to_roman"]["double_cap"] assert len(dcap) == 2 @@ -137,7 +134,7 @@ def test_dcaps_base2(self): assert "i︠o︡" in dcap def test_dcaps_inherited(self): - cap_inherited = self.tables.load_table("cap_inherited") + cap_inherited = get_language("cap_inherited") dcap = cap_inherited["script_to_roman"]["double_cap"] assert len(dcap) == 1 diff --git a/tests/test02_transliteration.py b/tests/test02_transliteration.py index 03c4744..eda188a 100644 --- a/tests/test02_transliteration.py +++ b/tests/test02_transliteration.py @@ -2,18 +2,26 @@ from unittest import TestCase, TestSuite, TextTestRunner from csv import reader -from glob import glob from json import loads as jloads -from 
os import environ, path +from os import environ, path, unlink -from tests import TEST_DATA_DIR, reload_tables from scriptshifter.trans import transliterate from scriptshifter.tables import get_language +from tests import TEST_DATA_DIR logger = logging.getLogger(__name__) +def setUpModule(): + from scriptshifter.tables import init_db + init_db() + + +def tearDownModule(): + unlink(environ["TXL_DB_PATH"]) + + class TestTrans(TestCase): """ Test S2R transliteration. @@ -68,8 +76,6 @@ def make_suite(): """ Build parametrized test cases. """ - reload_tables() - suite = TestSuite() with open(path.join( diff --git a/tests/test03_capitalization.py b/tests/test03_capitalization.py index 085cdf4..06acecd 100644 --- a/tests/test03_capitalization.py +++ b/tests/test03_capitalization.py @@ -1,19 +1,22 @@ -from os import environ +from os import environ, unlink from unittest import TestCase from scriptshifter.trans import transliterate -from tests import TEST_DATA_DIR, reload_tables + + +def setUpModule(): + from scriptshifter.tables import init_db + init_db() + + +def tearDownModule(): + unlink(environ["TXL_DB_PATH"]) class TestCapitalization(TestCase): """ Test capitalization. """ - - def setUp(self): - environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR - self.tables = reload_tables() - def test_cap(self): tbl = "cap_inherited" in_str = "зг іо" diff --git a/tests/test04_rest_api.py b/tests/test04_rest_api.py index 1bccd58..1f54b0e 100644 --- a/tests/test04_rest_api.py +++ b/tests/test04_rest_api.py @@ -1,25 +1,28 @@ import json -from os import environ +from os import environ, unlink from unittest import TestCase from scriptshifter.rest_api import app -from tests import TEST_DATA_DIR, reload_tables EP = "http://localhost:8000" +def setUpModule(): + from scriptshifter.tables import init_db + init_db() + + +def tearDownModule(): + unlink(environ["TXL_DB_PATH"]) + + class TestRestAPI(TestCase): """ Test REST API interaction. 
""" - def setUp(self): - environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR - # if "TXL_CONFIG_TABLE_DIR" in environ: - # del environ["TXL_CONFIG_TABLE_DIR"] - reload_tables() - - # Start webapp. - app.testing = True + # def setUp(self): + # # Start webapp. + # app.testing = True def test_health(self): with app.test_client() as c: @@ -35,7 +38,7 @@ def test_language_list(self): data = json.loads(rsp.get_data(as_text=True)) self.assertIn("inherited", data) - self.assertIn("name", data["inherited"]) + self.assertIn("label", data["inherited"]) self.assertNotIn("_base1", data) self.assertNotIn("_base2", data) self.assertNotIn("_base3", data) @@ -47,14 +50,17 @@ def test_lang_table(self): self.assertEqual(rsp.status_code, 200) data = json.loads(rsp.get_data(as_text=True)) - self.assertIn("general", data) + self.assertIn("case_sensitive", data) + self.assertIn("description", data) self.assertIn("roman_to_script", data) self.assertIn("map", data["roman_to_script"]) + self.assertEqual(data["has_r2s"], True) + self.assertEqual(data["has_s2r"], False) self.assertEqual(data["roman_to_script"]["map"][0], ["ABCD", ""]) def test_trans_api_s2r(self): with app.test_client() as c: - rsp = c.post("/trans", data={"lang": "rot3", "text": "defg"}) + rsp = c.post("/trans", json={"lang": "rot3", "text": "defg"}) self.assertEqual(rsp.status_code, 200) data = json.loads(rsp.get_data(as_text=True)) @@ -64,7 +70,7 @@ def test_trans_api_s2r(self): def test_trans_api_r2s(self): with app.test_client() as c: rsp = c.post( - "/trans", data={ + "/trans", json={ "lang": "rot3", "text": "abcd", "t_dir": "r2s" @@ -80,7 +86,7 @@ def test_trans_api_capitalize(self): with app.test_client() as c: rsp = c.post( "/trans", - data={ + json={ "lang": "rot3", "capitalize": "first", "text": "bcde", From 7e0722d359fb479f28450f1615cb2c51cccb023e Mon Sep 17 00:00:00 2001 From: scossu Date: Sun, 17 Nov 2024 19:53:21 -0500 Subject: [PATCH 4/6] Pass minimum test set. 
--- tests/data/script_samples/unittest.csv | 13 ++++---- tests/test01_cfg.py | 3 +- tests/test02_transliteration.py | 44 +++++++------------------- tests/test03_capitalization.py | 2 +- tests/test04_rest_api.py | 2 +- 5 files changed, 20 insertions(+), 44 deletions(-) diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv index 0113c11..eb4a1f6 100644 --- a/tests/data/script_samples/unittest.csv +++ b/tests/data/script_samples/unittest.csv @@ -1,7 +1,6 @@ -inherited,abcd,ABCD,, -inherited,ABCD,abcd,"{""dir"": ""r2s""}", -inherited,ab,90,, -rot3,abcd,defg,, -rot3,HIJK,KLMN,, -rot3,pqrs,Pqrs,"{""capitalize"": ""first""}", -rot3,pqrs,PQRS,"{""capitalize"": ""all""}", +inherited,abcd,9078,, +inherited,TUVX,tuvx,"{""t_dir"": ""r2s""}", +rot3,defg,abcd,, +rot3,HIJK,KLMN,"{""t_dir"": ""r2s""}", +rot3,st uv,Vw xy,"{""t_dir"": ""r2s"", ""capitalize"": ""first""}", +rot3,st uv,Vw Xy,"{""t_dir"": ""r2s"", ""capitalize"": ""all""}", diff --git a/tests/test01_cfg.py b/tests/test01_cfg.py index 6b7466f..8acd127 100644 --- a/tests/test01_cfg.py +++ b/tests/test01_cfg.py @@ -1,11 +1,10 @@ from os import environ, unlink from unittest import TestCase -from scriptshifter.tables import get_language +from scriptshifter.tables import get_language, init_db def setUpModule(): - from scriptshifter.tables import init_db init_db() diff --git a/tests/test02_transliteration.py b/tests/test02_transliteration.py index eda188a..3c73de8 100644 --- a/tests/test02_transliteration.py +++ b/tests/test02_transliteration.py @@ -6,7 +6,7 @@ from os import environ, path, unlink from scriptshifter.trans import transliterate -from scriptshifter.tables import get_language +from scriptshifter.tables import get_language, init_db from tests import TEST_DATA_DIR @@ -14,7 +14,6 @@ def setUpModule(): - from scriptshifter.tables import init_db init_db() @@ -31,12 +30,9 @@ class TestTrans(TestCase): TODO use a comprehensive sample table and report errors for unsupported 
languages. """ - - maxDiff = None - - def sample_s2r(self): + def sample(self): """ - Test S2R transliteration for one CSV sample. + Test transliteration for one CSV row. This function name won't start with `test_` otherwise will be automatically run without parameters. @@ -45,6 +41,7 @@ def sample_s2r(self): if config["has_s2r"]: txl = transliterate( self.script, self.tbl, + t_dir=self.options.get("t_dir", "s2r"), capitalize=self.options.get("capitalize", False), options=self.options)[0] self.assertEqual( @@ -52,25 +49,6 @@ def sample_s2r(self): f"S2R transliteration error for {self.tbl}!\n" f"Original: {self.script}") - def sample_r2s(self): - """ - Test R2S transliteration for one CSV sample. - - This function name won't start with `test_` otherwise will be - automatically run without parameters. - """ - config = get_language(self.tbl) - if config["has_r2s"]: - txl = transliterate( - self.roman, self.tbl, - t_dir="r2s", - capitalize=self.options.get("capitalize", False), - options=self.options)[0] - self.assertEqual( - txl, self.script, - f"R2S transliteration error for {self.tbl}!\n" - f"Original: {self.roman}") - def make_suite(): """ @@ -85,13 +63,13 @@ def make_suite(): for row in csv: if len(row[0]): # Inject transliteration info in the test case. 
- for tname in ("sample_s2r", "sample_r2s"): - tcase = TestTrans(tname) - tcase.tbl = row[0] - tcase.script = row[1].strip() - tcase.roman = row[2].strip() - tcase.options = jloads(row[3]) if len(row[3]) else {} - suite.addTest(tcase) + tcase = TestTrans("sample") + tcase.tbl = row[0] + tcase.script = row[1].strip() + tcase.roman = row[2].strip() + tcase.options = jloads(row[3]) if len(row[3]) else {} + + suite.addTest(tcase) return suite diff --git a/tests/test03_capitalization.py b/tests/test03_capitalization.py index 06acecd..eaf53b8 100644 --- a/tests/test03_capitalization.py +++ b/tests/test03_capitalization.py @@ -2,10 +2,10 @@ from unittest import TestCase from scriptshifter.trans import transliterate +from scriptshifter.tables import init_db def setUpModule(): - from scriptshifter.tables import init_db init_db() diff --git a/tests/test04_rest_api.py b/tests/test04_rest_api.py index 1f54b0e..bf065f4 100644 --- a/tests/test04_rest_api.py +++ b/tests/test04_rest_api.py @@ -4,13 +4,13 @@ from unittest import TestCase from scriptshifter.rest_api import app +from scriptshifter.tables import init_db EP = "http://localhost:8000" def setUpModule(): - from scriptshifter.tables import init_db init_db() From efb27b8707b92f52e0c463b1a0cc5f1daa900ce3 Mon Sep 17 00:00:00 2001 From: scossu Date: Mon, 25 Nov 2024 15:42:16 -0500 Subject: [PATCH 5/6] WIP Add tests for regex ignore patterns. 
--- sscli | 2 +- tests/data/script_samples/unittest.csv | 2 ++ tests/tables/data/regex.yml | 11 +++++++++++ tests/tables/index.yml | 3 +++ tests/test02_transliteration.py | 7 +++++-- 5 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 tests/tables/data/regex.yml diff --git a/sscli b/sscli index 154aaf2..dca7334 100755 --- a/sscli +++ b/sscli @@ -10,7 +10,7 @@ from os import path from scriptshifter import DB_PATH from scriptshifter.tables import init_db as _init_db -from tests import test_sample +from tests.integration_tests import test_sample @click.group() diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv index eb4a1f6..79e12fa 100644 --- a/tests/data/script_samples/unittest.csv +++ b/tests/data/script_samples/unittest.csv @@ -4,3 +4,5 @@ rot3,defg,abcd,, rot3,HIJK,KLMN,"{""t_dir"": ""r2s""}", rot3,st uv,Vw xy,"{""t_dir"": ""r2s"", ""capitalize"": ""first""}", rot3,st uv,Vw Xy,"{""t_dir"": ""r2s"", ""capitalize"": ""all""}", +regex,Hello abc,Hello 907,"{""t_dir"": ""r2s""}", +regex,Hollo abc,Hollo 907,"{""t_dir"": ""r2s""}", diff --git a/tests/tables/data/regex.yml b/tests/tables/data/regex.yml new file mode 100644 index 0000000..6f15123 --- /dev/null +++ b/tests/tables/data/regex.yml @@ -0,0 +1,11 @@ +--- +# Test file for regex ignoring. + +general: + name: Test regex ignoring. + parents: + - inherited + +roman_to_script: + ignore_ptn: + - "[hH][aeu]llo" diff --git a/tests/tables/index.yml b/tests/tables/index.yml index c45f9eb..3b4aad4 100644 --- a/tests/tables/index.yml +++ b/tests/tables/index.yml @@ -1,3 +1,4 @@ +--- inherited: name: Test inheritance leaf file marc_code: inh @@ -10,5 +11,7 @@ cap_inherited: name: Test capitalization ordering: name: Test ordering +regex: + name: inherited config + regex ignore. 
rot3: name: Test ROT3 hooks diff --git a/tests/test02_transliteration.py b/tests/test02_transliteration.py index 3c73de8..9e3856b 100644 --- a/tests/test02_transliteration.py +++ b/tests/test02_transliteration.py @@ -38,10 +38,13 @@ def sample(self): automatically run without parameters. """ config = get_language(self.tbl) - if config["has_s2r"]: + t_dir = self.options.get("t_dir", "s2r") + if ( + t_dir == "s2r" and config["has_s2r"] + or t_dir == "r2s" and config["has_r2s"]): txl = transliterate( self.script, self.tbl, - t_dir=self.options.get("t_dir", "s2r"), + t_dir=t_dir, capitalize=self.options.get("capitalize", False), options=self.options)[0] self.assertEqual( From 91f3ad98ab1ce018d654a475c706a5250d42d12d Mon Sep 17 00:00:00 2001 From: scossu Date: Mon, 2 Dec 2024 22:24:03 -0500 Subject: [PATCH 6/6] Complete basic tests for regex ignore. --- scriptshifter/trans.py | 30 ++++++++++++++++++++++---- tests/data/script_samples/unittest.csv | 4 ++-- tests/tables/data/regex.yml | 10 ++++++++- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/scriptshifter/trans.py b/scriptshifter/trans.py index 8f5a39e..0c8a74c 100644 --- a/scriptshifter/trans.py +++ b/scriptshifter/trans.py @@ -1,7 +1,7 @@ import logging from importlib import import_module -from re import compile +from re import Pattern, compile from scriptshifter.exceptions import BREAK, CONT from scriptshifter.tables import ( @@ -152,7 +152,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): # token or exit the scanning loop altogether. 
hret = _run_hook("begin_input_token", ctx) if hret == BREAK: - Logger.debug("Breaking text scanning from hook signal.") + logger.debug("Breaking text scanning from hook signal.") break if hret == CONT: logger.debug("Skipping scanning iteration from hook signal.") @@ -170,8 +170,21 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): if hret == CONT: continue - step = len(ctx.tk) - if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]: + _matching = False + if type(ctx.tk) is Pattern: + # Search RE pattern beginning at cursor. + if _ptn_match := ctx.tk.match(ctx.src[ctx.cur:]): + ctx.tk = _ptn_match[0] + logger.debug(f"Matched regex: {ctx.tk}") + step = len(ctx.tk) + _matching = True + else: + # Search exact match. + step = len(ctx.tk) + if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]: + _matching = True + + if _matching: # The position matches an ignore token. hret = _run_hook("on_ignore_match", ctx) if hret == BREAK: @@ -182,6 +195,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): logger.info(f"Ignored token: {ctx.tk}") ctx.dest_ls.append(ctx.tk) ctx.cur += step + if ctx.cur >= len(ctx.src): + # reached end of string. Stop ignoring. + # The outer loop will exit immediately after. + ctx.ignoring = False + break + cur_char = ctx.src[ctx.cur] ctx.ignoring = True break @@ -194,6 +213,9 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): delattr(ctx, "tk") delattr(ctx, "ignoring") + if ctx.cur >= len(ctx.src): + break + # Begin transliteration token lookup.
ctx.match = False diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv index 79e12fa..fda09ce 100644 --- a/tests/data/script_samples/unittest.csv +++ b/tests/data/script_samples/unittest.csv @@ -4,5 +4,5 @@ rot3,defg,abcd,, rot3,HIJK,KLMN,"{""t_dir"": ""r2s""}", rot3,st uv,Vw xy,"{""t_dir"": ""r2s"", ""capitalize"": ""first""}", rot3,st uv,Vw Xy,"{""t_dir"": ""r2s"", ""capitalize"": ""all""}", -regex,Hello abc,Hello 907,"{""t_dir"": ""r2s""}", -regex,Hollo abc,Hollo 907,"{""t_dir"": ""r2s""}", +regex,Hello abc,Hello 678,"{""t_dir"": ""r2s""}", +regex,Hullo abc,5u22o 678,"{""t_dir"": ""r2s""}", diff --git a/tests/tables/data/regex.yml b/tests/tables/data/regex.yml index 6f15123..cf18a09 100644 --- a/tests/tables/data/regex.yml +++ b/tests/tables/data/regex.yml @@ -8,4 +8,12 @@ general: roman_to_script: ignore_ptn: - - "[hH][aeu]llo" + - "[hH][ae]llo" + + map: + "h": "1" + "H": "5" + "l": "2" + "a": "6" + "b": "7" + "c": "8"