From 408e57a42f873aaaba27da8457a32f2b7f5cf58f Mon Sep 17 00:00:00 2001
From: scossu <stefano@cossu.cc>
Date: Mon, 21 Oct 2024 21:04:07 -0400
Subject: [PATCH 1/6] WIP ignore by regular expression.

---
 scriptshifter/tables/__init__.py           |  48 ++++-----
 scriptshifter/tables/data/_ignore_base.yml | 111 +++------------------
 2 files changed, 32 insertions(+), 127 deletions(-)

diff --git a/scriptshifter/tables/__init__.py b/scriptshifter/tables/__init__.py
index a2098c9..725192f 100644
--- a/scriptshifter/tables/__init__.py
+++ b/scriptshifter/tables/__init__.py
@@ -1,5 +1,4 @@
 import logging
-import re
 import sqlite3
 
 from collections import defaultdict
@@ -7,6 +6,7 @@
 from importlib import import_module
 from json import dumps as jdumps, loads as jloads
 from os import R_OK, access, environ, makedirs, path, unlink
+from re import compile
 from shutil import move
 
 from yaml import load
@@ -247,20 +247,19 @@ def populate_table(conn, tid, tname):
                             hook_data[1].__name__, jdumps(hook_data[2])))
 
         # Ignore rules (R2S only).
-        for row in sec.get("ignore", []):
-            if isinstance(row, dict):
-                if "re" in row:
-                    flags = FEAT_RE
-                    rule = row["re"]
-            else:
-                flags = 0
-                rule = row
+        for rule in sec.get("ignore", []):
+            conn.execute(
+                    """INSERT INTO tbl_ignore (
+                        lang_id, rule, features
+                    ) VALUES (?, ?, ?)""",
+                    (tid, rule, 0))
 
+        for rule in sec.get("ignore_ptn", []):
             conn.execute(
                     """INSERT INTO tbl_ignore (
                         lang_id, rule, features
                     ) VALUES (?, ?, ?)""",
-                    (tid, rule, flags))
+                    (tid, rule, FEAT_RE))
 
         # Double caps (S2R only).
         for rule in sec.get("double_cap", []):
@@ -417,33 +416,22 @@ def load_table(tname):
 
         # Ignore regular expression patterns.
         # Patterns are evaluated in the order they are listed in the config.
-        ignore_ptn = [
-                re.compile(ptn)
-                for ptn in tdata["roman_to_script"].get("ignore_ptn", [])]
+        ignore_ptn = tdata["roman_to_script"].get("ignore_ptn", [])
         for parent in parents:
             parent_tdata = load_table(parent)
             # NOTE: duplicates are not removed.
-            ignore_ptn = [
-                re.compile(ptn)
-                for ptn in parent_tdata.get(
-                        "roman_to_script", {}).get("ignore_ptn", [])
-            ] + ignore_ptn
+            ignore_ptn = parent_tdata.get(
+                    "roman_to_script", {}).get("ignore_ptn", []) + ignore_ptn
         tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn
 
         # Ignore plain strings.
-        ignore = {
-            Token(t)
-            for t in tdata["roman_to_script"].get("ignore", [])
-        }
+        ignore = set(tdata["roman_to_script"].get("ignore", []))
         for parent in parents:
             parent_tdata = load_table(parent)
             # No overriding occurs with the ignore list, only de-duplication.
-            ignore |= {
-                Token(t) for t in parent_tdata.get(
-                        "roman_to_script", {}).get("ignore", [])
-            }
-        tdata["roman_to_script"]["ignore"] = [
-                t.content for t in sorted(ignore)]
+            ignore |= set(parent_tdata.get(
+                        "roman_to_script", {}).get("ignore", []))
+        tdata["roman_to_script"]["ignore"] = sorted(ignore)
 
         # Hooks.
         if "hooks" in tdata["roman_to_script"]:
@@ -592,7 +580,9 @@ def get_lang_ignore(conn, lang_id):
             WHERE lang_id = ?""",
             (lang_id,))
     # Features (regular expressions) not implemented yet.
-    return tuple(row[0] for row in qry)
+    return tuple(
+            compile(row[0]) if row[1] & FEAT_RE else row[0]
+            for row in qry)
 
 
 @cache
diff --git a/scriptshifter/tables/data/_ignore_base.yml b/scriptshifter/tables/data/_ignore_base.yml
index 4937783..da083ac 100644
--- a/scriptshifter/tables/data/_ignore_base.yml
+++ b/scriptshifter/tables/data/_ignore_base.yml
@@ -16,106 +16,21 @@ roman_to_script:
     # dedicated U+2160÷U+216F (uppercase Roman
     # numerals) and/or U+2170÷U+217F (lower case Roman
     # numerals) ranges to avoid this ambiguity.
-    # TODO implement regular expressions for ignore patterns.
-    #- re: "I{2,3}"
-    #- re: "I(V|X)"
-    #- re: "LI{,3}"
-    #- re: "LI?(V|X)"
-    #- re: "L(V|X{1,3})I{,3}"
-    #- re: "LX{1,3}I?V"
-    #- re: "LX{1,3}VI{,3}"
-    #- re: "(V|X{1,3})I{,3}"
-    #- re: "X{1,3}I{,3}"
-    #- re: "X{1,3}I(V|X)"
-    #- re: "X{1,3}VI{,3}"
-    - "II"
-    - "III"
-    - "IV"
-    - "IX"
-    - "LI"
-    - "LII"
-    - "LIII"
-    - "LIV"
-    - "LIX"
-    - "LV"
-    - "LVI"
-    - "LVII"
-    - "LVIII"
-    - "LX"
-    - "LXI"
-    - "LXII"
-    - "LXIII"
-    - "LXIV"
-    - "LXIX"
-    - "LXV"
-    - "LXVI"
-    - "LXVII"
-    - "LXVIII"
-    - "LXX"
-    - "LXXI"
-    - "LXXII"
-    - "LXXIII"
-    - "LXXIV"
-    - "LXXIX"
-    - "LXXV"
-    - "LXXVI"
-    - "LXXVII"
-    - "LXXVIII"
-    - "LXXX"
-    - "LXXXI"
-    - "LXXXII"
-    - "LXXXIII"
-    - "LXXXIV"
-    - "LXXXIX"
-    - "LXXXV"
-    - "LXXXVI"
-    - "LXXXVII"
-    - "LXXXVIII"
-    - "VI"
-    - "VII"
-    - "VIII"
-    - "XI"
-    - "XII"
-    - "XIII"
-    - "XIV"
-    - "XIX"
-    - "XL"
-    - "XLI"
-    - "XLII"
-    - "XLIII"
-    - "XLIV"
-    - "XLIX"
-    - "XLV"
-    - "XLVI"
-    - "XLVII"
-    - "XLVIII"
-    - "XV"
-    - "XVI"
-    - "XVII"
-    - "XVIII"
-    - "XX"
-    - "XXI"
-    - "XXII"
-    - "XXIII"
-    - "XXIV"
-    - "XXIX"
-    - "XXV"
-    - "XXVI"
-    - "XXVII"
-    - "XXVIII"
-    - "XXX"
-    - "XXXI"
-    - "XXXII"
-    - "XXXIII"
-    - "XXXIV"
-    - "XXXIX"
-    - "XXXV"
-    - "XXXVI"
-    - "XXXVII"
-    - "XXXVIII"
     - "and one other"
-    #- re: "and ([a-z0-9]+ )?others"
     - "et al."
+  ignore_ptn:
+    - "and ([a-z0-9]+ )?others"
+    - "I{2,3}"
+    - "I(V|X)"
+    - "LI{,3}"
+    - "LI?(V|X)"
+    - "L(V|X{1,3})I{,3}"
+    - "LX{1,3}I?V"
+    - "LX{1,3}VI{,3}"
+    - "(V|X{1,3})I{,3}"
+    - "X{1,3}I{,3}"
+    - "X{1,3}I(V|X)"
+    - "X{1,3}VI{,3}"
 
 script_to_roman:
   ignore:

From 6c5cab47431440099b9c6981e4f3a4acca489741 Mon Sep 17 00:00:00 2001
From: scossu <stefano@cossu.cc>
Date: Sat, 16 Nov 2024 17:08:12 -0500
Subject: [PATCH 2/6] WIP regexp and testing framework.

---
 example.env                               |   2 +
 legacy/processNumbers.ts                  | 144 ++++++++++++++++++++++
 scriptshifter/tables/__init__.py          |  40 +++---
 scriptshifter/tables/{data => }/index.yml |   0
 tests/__init__.py                         |  22 +++-
 tests/data/{ => config}/_base1.yml        |   0
 tests/data/{ => config}/_base2.yml        |   0
 tests/data/{ => config}/_base3.yml        |   0
 tests/data/{ => config}/cap_base1.yml     |   0
 tests/data/{ => config}/cap_base2.yml     |   0
 tests/data/{ => config}/cap_inherited.yml |   0
 tests/data/{ => config}/index.yml         |   0
 tests/data/{ => config}/inherited.yml     |   0
 tests/data/{ => config}/ordering.yml      |   0
 tests/data/{ => config}/rot3.yml          |   0
 tests/data/script_samples/unittest.csv    |   9 ++
 tests/test02_transliteration.py           |  39 +++---
 17 files changed, 217 insertions(+), 39 deletions(-)
 create mode 100644 legacy/processNumbers.ts
 rename scriptshifter/tables/{data => }/index.yml (100%)
 rename tests/data/{ => config}/_base1.yml (100%)
 rename tests/data/{ => config}/_base2.yml (100%)
 rename tests/data/{ => config}/_base3.yml (100%)
 rename tests/data/{ => config}/cap_base1.yml (100%)
 rename tests/data/{ => config}/cap_base2.yml (100%)
 rename tests/data/{ => config}/cap_inherited.yml (100%)
 rename tests/data/{ => config}/index.yml (100%)
 rename tests/data/{ => config}/inherited.yml (100%)
 rename tests/data/{ => config}/ordering.yml (100%)
 rename tests/data/{ => config}/rot3.yml (100%)
 create mode 100644 tests/data/script_samples/unittest.csv

diff --git a/example.env b/example.env
index 004c0d4..5e30eb8 100644
--- a/example.env
+++ b/example.env
@@ -2,4 +2,6 @@ FLASK_DEBUG=true
 TXL_DICTA_EP="changeme"
 TXL_FLASK_SECRET="changeme"
 TXL_LOGLEVEL="INFO"
+TXL_EMAIL_FROM="me@loc.gov"
+TXL_EMAIL_TO="me@loc.gov"
 LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"
diff --git a/legacy/processNumbers.ts b/legacy/processNumbers.ts
new file mode 100644
index 0000000..691e5ab
--- /dev/null
+++ b/legacy/processNumbers.ts
@@ -0,0 +1,144 @@
+private processNumbers(pinyinString: string, tag: string, code: string): string {
+    //Rebuilds pinyinString, rendering each run of "letters#digits" tokens either as numerals (ordinal/date-like runs, or tag 245/830 with code "n") or as the plain pinyin text.
+    let outputString = "";
+    let useNumVersion = false; //forced on just below for tag 245/830 with code "n", where numbers are always treated as numbers
+    if ((tag == "245" || tag == "830") && code == "n") {
+       useNumVersion = true;
+    }
+
+    /*
+     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
+     * The delimiters will be captured and included in the string of tokens.  Only the even-numbered
+     * array elements are the true 'tokens', so the code for processing tokens is run only for even
+     * values of j.
+     */
+    let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u"));
+    let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$";
+    let numToken_re = new RegExp(numTokenPattern);
+    let n = tokens.length
+    //this.alert.info(tokens.join("|"),{autoClose: false})
+    for (let i = 0; i < n; i++) {
+        let toki = tokens[i];
+        if (toki.match(numToken_re)) {
+            /*
+             * When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive numerical tokens
+             * found after it.  Two versions of the string are maintained.  The textVersion is the original pinyin (minus the
+             * # suffixes).  In the numVersion, characters representing numbers are converted to Arabic numerals.  When a
+             * non-numerical token (or end of string) is encountered, the string of numerical tokens is evaluated to determine
+             * which version should be used in the output string.  The outer loop then continues where the inner loop left off.
+             */
+            let textVersion = "";
+            let numVersion = "";
+            for (let j = i; j < n; j++) {
+                let tokj = tokens[j];
+                /* a token without # (or the end of string) is reached */
+                if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) {
+                    //If this runs, then we are on the last token and it is numeric. Add text after # (if present) to numerical version
+                    let m = tokj.match(numToken_re);
+                    if (m) {
+                        textVersion += m[1]
+                        if (m[2] == "") {
+                            numVersion += m[1];
+                        } else {
+                            numVersion += m[2];
+                        }
+                    } else if (j == n - 1) {
+                    //if last token is non-numerical, just tack it on.
+                        textVersion += tokj;
+                        numVersion += tokj;
+                    } else if (textVersion.length > 0 && numVersion.length > 0) {
+                    //if not at end of string yet and token is non-numerical, remove the last delimiter that was appended
+                    //(outer loop will pick up at this point)
+                        textVersion = textVersion.substring(0, textVersion.length - 1);
+                        numVersion = numVersion.substring(0, numVersion.length - 1);
+                    }
+                    //evaluate the run built so far: ordinals ("di N"), four single digits, and "nian/yue/ri" dates use the numeric version
+                    //NOTE(review): useNumVersion is never reset, so once one run qualifies every later run in this call is numeric too - confirm intended
+                    if (numVersion.match(/^di [0-9]/i) ||
+                        numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
+                        numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
+                        numVersion.match(/[0-9]+ yue [0-9]+ ri/i) ||
+                        useNumVersion
+                       ) {
+                        useNumVersion = true;
+                        /*
+                         * At this point, string may contain literal translations of Chinese numerals
+                         * Convert these to Arabic numerals (for example "2 10 7" = "27").
+                         */
+
+                        while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
+                            m = numVersion.match(/([0-9]+) ([1-9]0+)/);
+                            if (m) {
+                                let sum = Number(m[1]) * Number(m[2]);
+                                numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum));
+                            } else {
+                                let mb = numVersion.match(/([1-9]0+) ([0-9]+)/);
+                                if (mb)
+                                {
+                                    let sumb = Number(mb[1]) + Number(mb[2]);
+                                    numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb));
+                                }
+                                else
+                                {
+                                    break;
+                                }
+                            }
+                        }
+
+                        //A few other tweaks
+                        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
+                        if ((tag == "245" || tag == "830") && code == "n") {
+                            while (numVersion.match(/[0-9] [0-9]/)) {
+                                numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
+                            }
+                        }
+                    }
+                    if (useNumVersion)
+                    {
+                        outputString += numVersion;
+                    }
+                    else
+                    {
+                        outputString += textVersion;
+                    }
+                    //if the end of the string is not reached, backtrack to the delimiter after the last numerical token
+                    //(i.e. two tokens ago)
+                    if (j < n - 1)
+                    {
+                        i = j - 2;
+                    }
+                    else //we are at the end of the string, so we are done!
+                    {
+                        i = j;
+                    }
+                    break;
+                }
+                //this is run when we are not yet at the end of the string and have not yet reached a non-numerical token
+                //This is identical to the code that is run above when the last token is numeric.
+                if (j % 2 == 0)
+                {
+                    let m = tokj.match(numToken_re);
+                    textVersion += m[1];
+                    if (m[2]== "")
+                    {
+                        numVersion += m[1];
+                    }
+                    else
+                    {
+                        numVersion += m[2];
+                    }
+                }
+                else //a delimiter, just tack it on.
+                {
+                    textVersion += tokj;
+                    numVersion += tokj;
+                }
+            }
+        }
+        else // the outer loop has encountered a non-numeric token or delimiter, just tack it on.
+        {
+            outputString += toki;
+        }
+    }
+    return outputString;
+ }
diff --git a/scriptshifter/tables/__init__.py b/scriptshifter/tables/__init__.py
index 725192f..0b576ca 100644
--- a/scriptshifter/tables/__init__.py
+++ b/scriptshifter/tables/__init__.py
@@ -143,7 +143,7 @@ def init_db():
 
     This operation removes any preexisting database.
 
-    All tables in the index file (`./data/index.yml`) will be parsed
+    All tables in the index file (`./index.yml`) will be parsed
     (including inheritance rules) and loaded into the designated DB.
 
     This must be done only once at bootstrap. To update individual tables,
@@ -151,7 +151,7 @@ def init_db():
     """
     # Create parent diretories if necessary.
     # If the DB already exists, it will be overwritten ONLY on success at
-    # hhis point.
+    # this point.
     if path.isfile(TMP_DB_PATH):
         # Remove previous temp file (possibly from failed attempt)
         unlink(TMP_DB_PATH)
@@ -166,21 +166,12 @@ def init_db():
             conn.executescript(fh.read())
 
     # Populate tables.
-    with open(path.join(TABLE_DIR, "index.yml")) as fh:
+    with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
         tlist = load(fh, Loader=Loader)
     try:
         with conn:
             for tname, tdata in tlist.items():
-                res = conn.execute(
-                    """INSERT INTO tbl_language (
-                        name, label, marc_code, description
-                    ) VALUES (?, ?, ?, ?)""",
-                    (
-                        tname, tdata.get("name"), tdata.get("marc_code"),
-                        tdata.get("description"),
-                    )
-                )
-                populate_table(conn, res.lastrowid, tname)
+                populate_table(conn, tname, tdata)
 
         # If the DB already exists, it will be overwritten ONLY on success at
         # thhis point.
@@ -201,7 +192,27 @@ def get_connection():
     return sqlite3.connect(DB_PATH)
 
 
-def populate_table(conn, tid, tname):
+def populate_table(conn, tname, tdata):
+    """
+    Populate an individual table with data from a configuration.
+
+    @param conn: SQLite connection.
+
+    @param tname(str): Table name.
+
+    @param tdata(dict): Table data.
+    """
+    res = conn.execute(
+        """INSERT INTO tbl_language (
+            name, label, marc_code, description
+        ) VALUES (?, ?, ?, ?)""",
+        (
+            tname, tdata.get("name"), tdata.get("marc_code"),
+            tdata.get("description"),
+        )
+    )
+    tid = res.lastrowid
+
     data = load_table(tname)
     flags = 0
     if "script_to_roman" in data:
@@ -579,7 +590,6 @@ def get_lang_ignore(conn, lang_id):
             """SELECT rule, features FROM tbl_ignore
             WHERE lang_id = ?""",
             (lang_id,))
-    # Features (regular expressions) not implemented yet.
     return tuple(
             compile(row[0]) if row[1] & FEAT_RE else row[0]
             for row in qry)
diff --git a/scriptshifter/tables/data/index.yml b/scriptshifter/tables/index.yml
similarity index 100%
rename from scriptshifter/tables/data/index.yml
rename to scriptshifter/tables/index.yml
diff --git a/tests/__init__.py b/tests/__init__.py
index e4cde3e..50725a7 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,11 +1,10 @@
 from csv import reader
 from difflib import ndiff
+from glob import glob
 from importlib import reload
 from json import loads as jloads
 from logging import getLogger
-from os import path
-
-import scriptshifter.tables
+from os import environ, path
 
 from scriptshifter.trans import transliterate
 
@@ -17,8 +16,20 @@
 
 
 def reload_tables():
-    reload(scriptshifter.tables)  # Reload new config dir.
+    if "TXL_CONFIG_TABLE_DIR" in environ:
+        del environ["TXL_CONFIG_TABLE_DIR"]
+
+    # import here to set modified test config dir.
     from scriptshifter import tables
+
+    tables.init_db()
+
+    for fname in glob(path.join(TEST_DATA_DIR, "config", ".yml")):
+        tname = path.splitext(path.basename(filename))[1]
+        with tables.get_connection() as conn:
+            tables.populate_table(conn, tname, {"name": fname})
+
+
     tables.list_tables.cache_clear()
     tables.get_language.cache_clear()
     tables.get_lang_map.cache_clear()
@@ -41,7 +52,10 @@ def test_sample(dset):
 
     with open(dset_fpath, newline="") as fh:
         csv = reader(fh)
+        i = 1
         for row in csv:
+            logger.info(f"CSV row #{i}")
+            i += 1
             lang, script, rom = row[:3]
             if not lang:
                 continue
diff --git a/tests/data/_base1.yml b/tests/data/config/_base1.yml
similarity index 100%
rename from tests/data/_base1.yml
rename to tests/data/config/_base1.yml
diff --git a/tests/data/_base2.yml b/tests/data/config/_base2.yml
similarity index 100%
rename from tests/data/_base2.yml
rename to tests/data/config/_base2.yml
diff --git a/tests/data/_base3.yml b/tests/data/config/_base3.yml
similarity index 100%
rename from tests/data/_base3.yml
rename to tests/data/config/_base3.yml
diff --git a/tests/data/cap_base1.yml b/tests/data/config/cap_base1.yml
similarity index 100%
rename from tests/data/cap_base1.yml
rename to tests/data/config/cap_base1.yml
diff --git a/tests/data/cap_base2.yml b/tests/data/config/cap_base2.yml
similarity index 100%
rename from tests/data/cap_base2.yml
rename to tests/data/config/cap_base2.yml
diff --git a/tests/data/cap_inherited.yml b/tests/data/config/cap_inherited.yml
similarity index 100%
rename from tests/data/cap_inherited.yml
rename to tests/data/config/cap_inherited.yml
diff --git a/tests/data/index.yml b/tests/data/config/index.yml
similarity index 100%
rename from tests/data/index.yml
rename to tests/data/config/index.yml
diff --git a/tests/data/inherited.yml b/tests/data/config/inherited.yml
similarity index 100%
rename from tests/data/inherited.yml
rename to tests/data/config/inherited.yml
diff --git a/tests/data/ordering.yml b/tests/data/config/ordering.yml
similarity index 100%
rename from tests/data/ordering.yml
rename to tests/data/config/ordering.yml
diff --git a/tests/data/rot3.yml b/tests/data/config/rot3.yml
similarity index 100%
rename from tests/data/rot3.yml
rename to tests/data/config/rot3.yml
diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv
new file mode 100644
index 0000000..432d293
--- /dev/null
+++ b/tests/data/script_samples/unittest.csv
@@ -0,0 +1,9 @@
+chinese,從易經解維摩詰經，臺北市大塊文化出版股份有限公司。,"cong yi jing jie wei mo jie jing, Taibei Shi da kuai wen hua chu ban gu fen you xian gong si.",,
+chinese,廖忠俊. 著名狀元榜眼探花傳略,liao zhong jun. zhu ming zhuang yuan bang yan tan hua zhuan lüe,,
+chinese,文學革命論 / 陳獨秀 -- 人的文學 / 周作人 -- 新文學運動的意義 / 張我軍.,wen xue ge ming lun / chen du xiu -- ren de wen xue / zhou zuo ren -- xin wen xue yun dong de yi yi / zhang wo jun.,,
+belarusian,Пётр Клімук : жыццё і подзвіг касманаўта,Pi︠o︡tr Klimuk : z︠h︡ytstsi︠o︡ i podzvih kasmanaŭta,,
+greek_classical,Ἡσιόδου τοῦ Ἀσκραίου Ἔργα καὶ ἡμέραι,Hēsiodou tou Askraiou Erga kai hēmerai,,
+korean_names,간규찬,Kan Kyu-ch'an,,Hangul; from Y. Lee,
+korean_names,강감찬,Kang Kam-ch'an,,Hangul; from Y. Lee,
+korean_nonames,내 나름 대로 의 사랑,Nae narŭm taero ŭi sarang,"{""capitalize"": ""first""}",From K-Romanizer,
+korean_nonames,내 마음 속 의 한국 문학,Nae maŭm sok ŭi Han'guk munhak,"{""capitalize"": ""first""}",From K-Romanizer,
diff --git a/tests/test02_transliteration.py b/tests/test02_transliteration.py
index 9767ee9..03c4744 100644
--- a/tests/test02_transliteration.py
+++ b/tests/test02_transliteration.py
@@ -8,7 +8,7 @@
 
 from tests import TEST_DATA_DIR, reload_tables
 from scriptshifter.trans import transliterate
-import scriptshifter.tables
+from scriptshifter.tables import get_language
 
 
 logger = logging.getLogger(__name__)
@@ -33,8 +33,8 @@ def sample_s2r(self):
         This function name won't start with `test_` otherwise will be
         automatically run without parameters.
         """
-        config = scriptshifter.tables.load_table(self.tbl)
-        if "script_to_roman" in config:
+        config = get_language(self.tbl)
+        if config["has_s2r"]:
             txl = transliterate(
                     self.script, self.tbl,
                     capitalize=self.options.get("capitalize", False),
@@ -51,8 +51,8 @@ def sample_r2s(self):
         This function name won't start with `test_` otherwise will be
         automatically run without parameters.
         """
-        config = scriptshifter.tables.load_table(self.tbl)
-        if "roman_to_script" in config:
+        config = get_language(self.tbl)
+        if config["has_r2s"]:
             txl = transliterate(
                     self.roman, self.tbl,
                     t_dir="r2s",
@@ -68,25 +68,24 @@ def make_suite():
     """
     Build parametrized test cases.
     """
-    if "TXL_CONFIG_TABLE_DIR" in environ:
-        del environ["TXL_CONFIG_TABLE_DIR"]
     reload_tables()
 
     suite = TestSuite()
 
-    for fpath in glob(path.join(TEST_DATA_DIR, "script_samples", "*.csv")):
-        with open(fpath, newline="") as fh:
-            csv = reader(fh)
-            for row in csv:
-                if len(row[0]):
-                    # Inject transliteration info in the test case.
-                    for tname in ("sample_s2r", "sample_r2s"):
-                        tcase = TestTrans(tname)
-                        tcase.tbl = row[0]
-                        tcase.script = row[1].strip()
-                        tcase.roman = row[2].strip()
-                        tcase.options = jloads(row[3]) if len(row[3]) else {}
-                        suite.addTest(tcase)
+    with open(path.join(
+        TEST_DATA_DIR, "script_samples", "unittest.csv"
+    ), newline="") as fh:
+        csv = reader(fh)
+        for row in csv:
+            if len(row[0]):
+                # Inject transliteration info in the test case.
+                for tname in ("sample_s2r", "sample_r2s"):
+                    tcase = TestTrans(tname)
+                    tcase.tbl = row[0]
+                    tcase.script = row[1].strip()
+                    tcase.roman = row[2].strip()
+                    tcase.options = jloads(row[3]) if len(row[3]) else {}
+                    suite.addTest(tcase)
 
     return suite
 

From 90b9f4c1f1540f7de9584224b54d78e75bcd617d Mon Sep 17 00:00:00 2001
From: scossu <stefano@cossu.cc>
Date: Sat, 16 Nov 2024 19:41:19 -0500
Subject: [PATCH 3/6] Do unit test properly.

---
 doc/rest_api.md                               |  2 +-
 scriptshifter/__init__.py                     |  2 +-
 scriptshifter/tables/__init__.py              | 16 ++--
 tests/__init__.py                             | 88 +++----------------
 tests/data/config/index.yml                   |  6 --
 tests/data/script_samples/unittest.csv        | 16 ++--
 tests/integration_tests.py                    | 58 ++++++++++++
 tests/{data/config => tables/data}/_base1.yml |  0
 tests/{data/config => tables/data}/_base2.yml |  0
 tests/{data/config => tables/data}/_base3.yml |  0
 .../config => tables/data}/cap_base1.yml      |  0
 .../config => tables/data}/cap_base2.yml      |  0
 .../config => tables/data}/cap_inherited.yml  |  0
 .../config => tables/data}/inherited.yml      |  0
 .../{data/config => tables/data}/ordering.yml |  0
 tests/{data/config => tables/data}/rot3.yml   |  0
 tests/tables/index.yml                        | 14 +++
 tests/test01_cfg.py                           | 63 +++++++------
 tests/test02_transliteration.py               | 16 ++--
 tests/test03_capitalization.py                | 17 ++--
 tests/test04_rest_api.py                      | 36 ++++----
 21 files changed, 171 insertions(+), 163 deletions(-)
 delete mode 100644 tests/data/config/index.yml
 create mode 100644 tests/integration_tests.py
 rename tests/{data/config => tables/data}/_base1.yml (100%)
 rename tests/{data/config => tables/data}/_base2.yml (100%)
 rename tests/{data/config => tables/data}/_base3.yml (100%)
 rename tests/{data/config => tables/data}/cap_base1.yml (100%)
 rename tests/{data/config => tables/data}/cap_base2.yml (100%)
 rename tests/{data/config => tables/data}/cap_inherited.yml (100%)
 rename tests/{data/config => tables/data}/inherited.yml (100%)
 rename tests/{data/config => tables/data}/ordering.yml (100%)
 rename tests/{data/config => tables/data}/rot3.yml (100%)
 create mode 100644 tests/tables/index.yml

diff --git a/doc/rest_api.md b/doc/rest_api.md
index b4712c3..94bf4bb 100644
--- a/doc/rest_api.md
+++ b/doc/rest_api.md
@@ -73,7 +73,7 @@ MIME type: `application/json`
 
 Content: JSON object with the following keys:
 
-- `lang`: Language code as given by the `/languages` endpoint. 
+- `lang`: Language code as given by the `/languages` endpoint.
 - `text`: Input text to be transliterated.
 - `capitalize`: One of `first` (capitalize the first letter of the input),
   `all` (capitalize all words separated by spaces), or null (default: apply no
diff --git a/scriptshifter/__init__.py b/scriptshifter/__init__.py
index e9a4e6f..d6adb57 100644
--- a/scriptshifter/__init__.py
+++ b/scriptshifter/__init__.py
@@ -15,7 +15,7 @@
 This DB stores all the runtime transliteration data.
 """
 DB_PATH = environ.get(
-        "DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
+        "TXL_DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
 
 """
 SMTP server for sending email. For a dummy server that just echoes the
diff --git a/scriptshifter/tables/__init__.py b/scriptshifter/tables/__init__.py
index 0b576ca..ee6126d 100644
--- a/scriptshifter/tables/__init__.py
+++ b/scriptshifter/tables/__init__.py
@@ -28,9 +28,6 @@
 """
 
 
-TMP_DB_PATH = path.join(
-        path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))
-
 DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
 # Can be overridden for tests.
 TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)
@@ -152,6 +149,8 @@ def init_db():
     # Create parent diretories if necessary.
     # If the DB already exists, it will be overwritten ONLY on success at
     # this point.
+    TMP_DB_PATH = path.join(
+            path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))
     if path.isfile(TMP_DB_PATH):
         # Remove previous temp file (possibly from failed attempt)
         unlink(TMP_DB_PATH)
@@ -176,6 +175,7 @@ def init_db():
         # If the DB already exists, it will be overwritten ONLY on success at
         # thhis point.
         move(TMP_DB_PATH, DB_PATH)
+        logger.info(f"Database initialized at {DB_PATH}.")
     finally:
         conn.close()
         if path.isfile(TMP_DB_PATH):
@@ -520,6 +520,10 @@ def get_language(lang):
             if len(s2r_hooks):
                 data["script_to_roman"]["hooks"] = s2r_hooks
 
+            double_cap = get_lang_dcap(conn, lang_id)
+            if len(double_cap):
+                data["script_to_roman"]["double_cap"] = double_cap
+
         # Roman to script map, ignore list, and hooks.
 
         if data["has_r2s"]:
@@ -541,10 +545,6 @@ def get_language(lang):
         if len(opt_data):
             data["options"] = opt_data
 
-        double_cap = get_lang_dcap(conn, lang_id)
-        if len(double_cap):
-            data["double_cap"] = double_cap
-
     conn.close()
 
     return data
@@ -652,7 +652,7 @@ def get_lang_hooks(conn, lang_id, t_dir):
             }
         )
 
-    return hooks
+    return dict(hooks)
 
 
 def get_lang_dcap(conn, lang_id):
diff --git a/tests/__init__.py b/tests/__init__.py
index 50725a7..4d43854 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,85 +1,17 @@
-from csv import reader
-from difflib import ndiff
-from glob import glob
 from importlib import reload
-from json import loads as jloads
-from logging import getLogger
-from os import environ, path
+from os import path, environ
+from tempfile import gettempdir
 
-from scriptshifter.trans import transliterate
+import scriptshifter
+from scriptshifter import tables
 
 
 TEST_DIR = path.dirname(path.realpath(__file__))
 TEST_DATA_DIR = path.join(TEST_DIR, "data")
+TEST_CONFIG_DIR = path.join(TEST_DIR, "tables", "data")
 
-logger = getLogger(__name__)
-
-
-def reload_tables():
-    if "TXL_CONFIG_TABLE_DIR" in environ:
-        del environ["TXL_CONFIG_TABLE_DIR"]
-
-    # import here to set modified test config dir.
-    from scriptshifter import tables
-
-    tables.init_db()
-
-    for fname in glob(path.join(TEST_DATA_DIR, "config", ".yml")):
-        tname = path.splitext(path.basename(filename))[1]
-        with tables.get_connection() as conn:
-            tables.populate_table(conn, tname, {"name": fname})
-
-
-    tables.list_tables.cache_clear()
-    tables.get_language.cache_clear()
-    tables.get_lang_map.cache_clear()
-
-    return tables
-
-
-def test_sample(dset):
-    """
-    Test an individual sample set and produce a human-readable report.
-
-    Used outside of automated tests.
-
-    @param dset (str): sample set name (without the .csv extension) found in
-    the `data/script_samples` directory.
-    """
-    deltas = []
-    dset_fpath = path.join(TEST_DATA_DIR, "script_samples", dset + ".csv")
-    log_fpath = path.join(TEST_DATA_DIR, f"test_{dset}.log")
-
-    with open(dset_fpath, newline="") as fh:
-        csv = reader(fh)
-        i = 1
-        for row in csv:
-            logger.info(f"CSV row #{i}")
-            i += 1
-            lang, script, rom = row[:3]
-            if not lang:
-                continue
-            opts = jloads(row[3]) if len(row) > 3 and row[3] else {}
-            trans, warnings = transliterate(
-                    script, lang, t_dir="s2r",
-                    capitalize=opts.get("capitalize"), options=opts)
-            if (trans == rom):
-                print(".", end="")
-            else:
-                print("F", end="")
-                deltas.append((lang, script, ndiff([trans], [rom])))
-
-    with open(log_fpath, "w") as fh:
-        # If no deltas, just truncate the file.
-        for lang, script, delta in deltas:
-            fh.write(f"Language: {lang}\n")
-            fh.write(f"Original: {script}\nDiff (result vs. expected):\n")
-            for dline in delta:
-                fh.write(dline.strip() + "\n")
-            fh.write("\n\n")
-
-    ct = len(deltas)
-    if ct > 0:
-        print(f"{ct} failed tests. See report at {log_fpath}")
-    else:
-        print("All tests passed.")
+# Reload main SS modules after changing environment variables.
+environ["TXL_DB_PATH"] = path.join(gettempdir(), "scriptshifter_unittest.db")
+reload(scriptshifter)
+environ["TXL_CONFIG_TABLE_DIR"] = TEST_CONFIG_DIR
+reload(tables)
diff --git a/tests/data/config/index.yml b/tests/data/config/index.yml
deleted file mode 100644
index 489e169..0000000
--- a/tests/data/config/index.yml
+++ /dev/null
@@ -1,6 +0,0 @@
-inherited:
-  name: Test inherited table
-ordering:
-  name: Test ordering
-rot3:
-  name: Test ROT3 hooks
diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv
index 432d293..0113c11 100644
--- a/tests/data/script_samples/unittest.csv
+++ b/tests/data/script_samples/unittest.csv
@@ -1,9 +1,7 @@
-chinese,從易經解維摩詰經，臺北市大塊文化出版股份有限公司。,"cong yi jing jie wei mo jie jing, Taibei Shi da kuai wen hua chu ban gu fen you xian gong si.",,
-chinese,廖忠俊. 著名狀元榜眼探花傳略,liao zhong jun. zhu ming zhuang yuan bang yan tan hua zhuan lüe,,
-chinese,文學革命論 / 陳獨秀 -- 人的文學 / 周作人 -- 新文學運動的意義 / 張我軍.,wen xue ge ming lun / chen du xiu -- ren de wen xue / zhou zuo ren -- xin wen xue yun dong de yi yi / zhang wo jun.,,
-belarusian,Пётр Клімук : жыццё і подзвіг касманаўта,Pi︠o︡tr Klimuk : z︠h︡ytstsi︠o︡ i podzvih kasmanaŭta,,
-greek_classical,Ἡσιόδου τοῦ Ἀσκραίου Ἔργα καὶ ἡμέραι,Hēsiodou tou Askraiou Erga kai hēmerai,,
-korean_names,간규찬,Kan Kyu-ch'an,,Hangul; from Y. Lee,
-korean_names,강감찬,Kang Kam-ch'an,,Hangul; from Y. Lee,
-korean_nonames,내 나름 대로 의 사랑,Nae narŭm taero ŭi sarang,"{""capitalize"": ""first""}",From K-Romanizer,
-korean_nonames,내 마음 속 의 한국 문학,Nae maŭm sok ŭi Han'guk munhak,"{""capitalize"": ""first""}",From K-Romanizer,
+inherited,abcd,ABCD,,
+inherited,ABCD,abcd,"{""dir"": ""r2s""}",
+inherited,ab,90,,
+rot3,abcd,defg,,
+rot3,HIJK,KLMN,,
+rot3,pqrs,Pqrs,"{""capitalize"": ""first""}",
+rot3,pqrs,PQRS,"{""capitalize"": ""all""}",
diff --git a/tests/integration_tests.py b/tests/integration_tests.py
new file mode 100644
index 0000000..e079664
--- /dev/null
+++ b/tests/integration_tests.py
@@ -0,0 +1,58 @@
+from csv import reader
+from difflib import ndiff
+from json import loads as jloads
+from logging import getLogger
+from os import path
+
+from scriptshifter.trans import transliterate
+from tests import TEST_DATA_DIR
+
+logger = getLogger(__name__)
+
+
+def test_sample(dset):
+    """
+    Test an individual sample set and produce a human-readable report.
+
+    Used outside of automated tests.
+
+    @param dset (str): sample set name (without the .csv extension) found in
+    the `data/script_samples` directory.
+    """
+    deltas = []
+    dset_fpath = path.join(TEST_DATA_DIR, "script_samples", dset + ".csv")
+    log_fpath = path.join(TEST_DATA_DIR, f"test_{dset}.log")
+
+    with open(dset_fpath, newline="") as fh:
+        csv = reader(fh)
+        i = 1
+        for row in csv:
+            logger.info(f"CSV row #{i}")
+            i += 1
+            lang, script, rom = row[:3]
+            if not lang:
+                continue
+            opts = jloads(row[3]) if len(row) > 3 and row[3] else {}
+            trans, warnings = transliterate(
+                    script, lang, t_dir="s2r",
+                    capitalize=opts.get("capitalize"), options=opts)
+            if (trans == rom):
+                print(".", end="")
+            else:
+                print("F", end="")
+                deltas.append((lang, script, ndiff([trans], [rom])))
+
+    with open(log_fpath, "w") as fh:
+        # If no deltas, just truncate the file.
+        for lang, script, delta in deltas:
+            fh.write(f"Language: {lang}\n")
+            fh.write(f"Original: {script}\nDiff (result vs. expected):\n")
+            for dline in delta:
+                fh.write(dline.strip() + "\n")
+            fh.write("\n\n")
+
+    ct = len(deltas)
+    if ct > 0:
+        print(f"{ct} failed tests. See report at {log_fpath}")
+    else:
+        print("All tests passed.")
diff --git a/tests/data/config/_base1.yml b/tests/tables/data/_base1.yml
similarity index 100%
rename from tests/data/config/_base1.yml
rename to tests/tables/data/_base1.yml
diff --git a/tests/data/config/_base2.yml b/tests/tables/data/_base2.yml
similarity index 100%
rename from tests/data/config/_base2.yml
rename to tests/tables/data/_base2.yml
diff --git a/tests/data/config/_base3.yml b/tests/tables/data/_base3.yml
similarity index 100%
rename from tests/data/config/_base3.yml
rename to tests/tables/data/_base3.yml
diff --git a/tests/data/config/cap_base1.yml b/tests/tables/data/cap_base1.yml
similarity index 100%
rename from tests/data/config/cap_base1.yml
rename to tests/tables/data/cap_base1.yml
diff --git a/tests/data/config/cap_base2.yml b/tests/tables/data/cap_base2.yml
similarity index 100%
rename from tests/data/config/cap_base2.yml
rename to tests/tables/data/cap_base2.yml
diff --git a/tests/data/config/cap_inherited.yml b/tests/tables/data/cap_inherited.yml
similarity index 100%
rename from tests/data/config/cap_inherited.yml
rename to tests/tables/data/cap_inherited.yml
diff --git a/tests/data/config/inherited.yml b/tests/tables/data/inherited.yml
similarity index 100%
rename from tests/data/config/inherited.yml
rename to tests/tables/data/inherited.yml
diff --git a/tests/data/config/ordering.yml b/tests/tables/data/ordering.yml
similarity index 100%
rename from tests/data/config/ordering.yml
rename to tests/tables/data/ordering.yml
diff --git a/tests/data/config/rot3.yml b/tests/tables/data/rot3.yml
similarity index 100%
rename from tests/data/config/rot3.yml
rename to tests/tables/data/rot3.yml
diff --git a/tests/tables/index.yml b/tests/tables/index.yml
new file mode 100644
index 0000000..c45f9eb
--- /dev/null
+++ b/tests/tables/index.yml
@@ -0,0 +1,14 @@
+inherited:
+  name: Test inheritance leaf file
+  marc_code: inh
+  description: Test description.
+cap_base1:
+  name: Test capitalization base 1
+cap_base2:
+  name: Test capitalization base 2
+cap_inherited:
+  name: Test capitalization
+ordering:
+  name: Test ordering
+rot3:
+  name: Test ROT3 hooks
diff --git a/tests/test01_cfg.py b/tests/test01_cfg.py
index c861d91..6b7466f 100644
--- a/tests/test01_cfg.py
+++ b/tests/test01_cfg.py
@@ -1,20 +1,22 @@
+from os import environ, unlink
 from unittest import TestCase
 
-from os import environ
+from scriptshifter.tables import get_language
 
-import scriptshifter
 
-from tests import TEST_DATA_DIR, reload_tables
+def setUpModule():
+    from scriptshifter.tables import init_db
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
 
 
 class TestConfig(TestCase):
     """ Test configuration parsing. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_ordering(self):
-        tbl = self.tables.load_table("ordering")
+        tbl = get_language("ordering")
         exp_order = ["ABCD", "AB", "A", "BCDE", "BCD", "BEFGH", "B"]
 
         self.assertEqual(
@@ -23,19 +25,17 @@ def test_ordering(self):
 
 class TestOverride(TestCase):
     """ Test configuration overrides. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_override_map(self):
-        tbl = self.tables.load_table("inherited")
+        tbl = get_language("inherited")
 
-        self.assertEqual(tbl["general"]["name"], "Test inheritance leaf file")
+        self.assertEqual(tbl["label"], "Test inheritance leaf file")
+        self.assertEqual(tbl["marc_code"], "inh")
+        self.assertEqual(tbl["description"], "Test description.")
 
         # Entries are additive.
         self.assertEqual(
                 tbl["roman_to_script"]["ignore"],
-                ["Fritter my wig", "Hi", "Ho", "Thing-um-a-jig"])
+                ("Fritter my wig", "Hi", "Ho", "Thing-um-a-jig"))
         self.assertEqual(
                 tbl["roman_to_script"]["map"],
                 (
@@ -102,34 +102,31 @@ def test_override_map(self):
 
 class TestHooks(TestCase):
     """ Test parsing of hook functions. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_rot3(self):
-        tbl = self.tables.load_table("rot3")
+        tbl = get_language("rot3")
 
         self.assertEqual(
-                tbl["script_to_roman"]["hooks"],
-                {
-                    "begin_input_token": [
-                        ("test", scriptshifter.hooks.test.rotate, {"n": -3})
-                    ]
-                })
+            tbl["script_to_roman"]["hooks"],
+            {
+                "begin_input_token": [
+                    {
+                        "module_name": "test",
+                        "fn_name": "rotate",
+                        "kwargs": {"n": -3},
+                    }
+                ]
+            }
+        )
 
 
 class TestDoubleCaps(TestCase):
     """ Test double capitalization configuration. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_dcaps_base1(self):
-        cap_base1 = self.tables.load_table("cap_base1")
+        cap_base1 = get_language("cap_base1")
         assert "z︠h︡" in cap_base1["script_to_roman"]["double_cap"]
 
     def test_dcaps_base2(self):
-        cap_base2 = self.tables.load_table("cap_base2")
+        cap_base2 = get_language("cap_base2")
         dcap = cap_base2["script_to_roman"]["double_cap"]
 
         assert len(dcap) == 2
@@ -137,7 +134,7 @@ def test_dcaps_base2(self):
         assert "i︠o︡" in dcap
 
     def test_dcaps_inherited(self):
-        cap_inherited = self.tables.load_table("cap_inherited")
+        cap_inherited = get_language("cap_inherited")
         dcap = cap_inherited["script_to_roman"]["double_cap"]
 
         assert len(dcap) == 1
diff --git a/tests/test02_transliteration.py b/tests/test02_transliteration.py
index 03c4744..eda188a 100644
--- a/tests/test02_transliteration.py
+++ b/tests/test02_transliteration.py
@@ -2,18 +2,26 @@
 
 from unittest import TestCase, TestSuite, TextTestRunner
 from csv import reader
-from glob import glob
 from json import loads as jloads
-from os import environ, path
+from os import environ, path, unlink
 
-from tests import TEST_DATA_DIR, reload_tables
 from scriptshifter.trans import transliterate
 from scriptshifter.tables import get_language
+from tests import TEST_DATA_DIR
 
 
 logger = logging.getLogger(__name__)
 
 
+def setUpModule():
+    from scriptshifter.tables import init_db
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
+
+
 class TestTrans(TestCase):
     """
     Test S2R transliteration.
@@ -68,8 +76,6 @@ def make_suite():
     """
     Build parametrized test cases.
     """
-    reload_tables()
-
     suite = TestSuite()
 
     with open(path.join(
diff --git a/tests/test03_capitalization.py b/tests/test03_capitalization.py
index 085cdf4..06acecd 100644
--- a/tests/test03_capitalization.py
+++ b/tests/test03_capitalization.py
@@ -1,19 +1,22 @@
-from os import environ
+from os import environ, unlink
 from unittest import TestCase
 
 from scriptshifter.trans import transliterate
-from tests import TEST_DATA_DIR, reload_tables
+
+
+def setUpModule():
+    from scriptshifter.tables import init_db
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
 
 
 class TestCapitalization(TestCase):
     """
     Test capitalization.
     """
-
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_cap(self):
         tbl = "cap_inherited"
         in_str = "зг іо"
diff --git a/tests/test04_rest_api.py b/tests/test04_rest_api.py
index 1bccd58..1f54b0e 100644
--- a/tests/test04_rest_api.py
+++ b/tests/test04_rest_api.py
@@ -1,25 +1,28 @@
 import json
 
-from os import environ
+from os import environ, unlink
 from unittest import TestCase
 
 from scriptshifter.rest_api import app
-from tests import TEST_DATA_DIR, reload_tables
 
 
 EP = "http://localhost:8000"
 
 
+def setUpModule():
+    from scriptshifter.tables import init_db
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
+
+
 class TestRestAPI(TestCase):
     """ Test REST API interaction. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        # if "TXL_CONFIG_TABLE_DIR" in environ:
-        #     del environ["TXL_CONFIG_TABLE_DIR"]
-        reload_tables()
-
-        # Start webapp.
-        app.testing = True
+    # def setUp(self):
+    #     # Start webapp.
+    #     app.testing = True
 
     def test_health(self):
         with app.test_client() as c:
@@ -35,7 +38,7 @@ def test_language_list(self):
 
         data = json.loads(rsp.get_data(as_text=True))
         self.assertIn("inherited", data)
-        self.assertIn("name", data["inherited"])
+        self.assertIn("label", data["inherited"])
         self.assertNotIn("_base1", data)
         self.assertNotIn("_base2", data)
         self.assertNotIn("_base3", data)
@@ -47,14 +50,17 @@ def test_lang_table(self):
         self.assertEqual(rsp.status_code, 200)
         data = json.loads(rsp.get_data(as_text=True))
 
-        self.assertIn("general", data)
+        self.assertIn("case_sensitive", data)
+        self.assertIn("description", data)
         self.assertIn("roman_to_script", data)
         self.assertIn("map", data["roman_to_script"])
+        self.assertEqual(data["has_r2s"], True)
+        self.assertEqual(data["has_s2r"], False)
         self.assertEqual(data["roman_to_script"]["map"][0], ["ABCD", ""])
 
     def test_trans_api_s2r(self):
         with app.test_client() as c:
-            rsp = c.post("/trans", data={"lang": "rot3", "text": "defg"})
+            rsp = c.post("/trans", json={"lang": "rot3", "text": "defg"})
 
         self.assertEqual(rsp.status_code, 200)
         data = json.loads(rsp.get_data(as_text=True))
@@ -64,7 +70,7 @@ def test_trans_api_s2r(self):
     def test_trans_api_r2s(self):
         with app.test_client() as c:
             rsp = c.post(
-                "/trans", data={
+                "/trans", json={
                     "lang": "rot3",
                     "text": "abcd",
                     "t_dir": "r2s"
@@ -80,7 +86,7 @@ def test_trans_api_capitalize(self):
         with app.test_client() as c:
             rsp = c.post(
                 "/trans",
-                data={
+                json={
                     "lang": "rot3",
                     "capitalize": "first",
                     "text": "bcde",

From 7e0722d359fb479f28450f1615cb2c51cccb023e Mon Sep 17 00:00:00 2001
From: scossu <stefano@cossu.cc>
Date: Sun, 17 Nov 2024 19:53:21 -0500
Subject: [PATCH 4/6] Pass minimum test set.

---
 tests/data/script_samples/unittest.csv | 13 ++++----
 tests/test01_cfg.py                    |  3 +-
 tests/test02_transliteration.py        | 44 +++++++-------------------
 tests/test03_capitalization.py         |  2 +-
 tests/test04_rest_api.py               |  2 +-
 5 files changed, 20 insertions(+), 44 deletions(-)

diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv
index 0113c11..eb4a1f6 100644
--- a/tests/data/script_samples/unittest.csv
+++ b/tests/data/script_samples/unittest.csv
@@ -1,7 +1,6 @@
-inherited,abcd,ABCD,,
-inherited,ABCD,abcd,"{""dir"": ""r2s""}",
-inherited,ab,90,,
-rot3,abcd,defg,,
-rot3,HIJK,KLMN,,
-rot3,pqrs,Pqrs,"{""capitalize"": ""first""}",
-rot3,pqrs,PQRS,"{""capitalize"": ""all""}",
+inherited,abcd,9078,,
+inherited,TUVX,tuvx,"{""t_dir"": ""r2s""}",
+rot3,defg,abcd,,
+rot3,HIJK,KLMN,"{""t_dir"": ""r2s""}",
+rot3,st uv,Vw xy,"{""t_dir"": ""r2s"", ""capitalize"": ""first""}",
+rot3,st uv,Vw Xy,"{""t_dir"": ""r2s"", ""capitalize"": ""all""}",
diff --git a/tests/test01_cfg.py b/tests/test01_cfg.py
index 6b7466f..8acd127 100644
--- a/tests/test01_cfg.py
+++ b/tests/test01_cfg.py
@@ -1,11 +1,10 @@
 from os import environ, unlink
 from unittest import TestCase
 
-from scriptshifter.tables import get_language
+from scriptshifter.tables import get_language, init_db
 
 
 def setUpModule():
-    from scriptshifter.tables import init_db
     init_db()
 
 
diff --git a/tests/test02_transliteration.py b/tests/test02_transliteration.py
index eda188a..3c73de8 100644
--- a/tests/test02_transliteration.py
+++ b/tests/test02_transliteration.py
@@ -6,7 +6,7 @@
 from os import environ, path, unlink
 
 from scriptshifter.trans import transliterate
-from scriptshifter.tables import get_language
+from scriptshifter.tables import get_language, init_db
 from tests import TEST_DATA_DIR
 
 
@@ -14,7 +14,6 @@
 
 
 def setUpModule():
-    from scriptshifter.tables import init_db
     init_db()
 
 
@@ -31,12 +30,9 @@ class TestTrans(TestCase):
     TODO use a comprehensive sample table and report errors for unsupported
     languages.
     """
-
-    maxDiff = None
-
-    def sample_s2r(self):
+    def sample(self):
         """
-        Test S2R transliteration for one CSV sample.
+        Test transliteration for one CSV row.
 
         This function name won't start with `test_` otherwise will be
         automatically run without parameters.
@@ -45,6 +41,7 @@ def sample_s2r(self):
         if config["has_s2r"]:
             txl = transliterate(
                     self.script, self.tbl,
+                    t_dir=self.options.get("t_dir", "s2r"),
                     capitalize=self.options.get("capitalize", False),
                     options=self.options)[0]
             self.assertEqual(
@@ -52,25 +49,6 @@ def sample_s2r(self):
                     f"S2R transliteration error for {self.tbl}!\n"
                     f"Original: {self.script}")
 
-    def sample_r2s(self):
-        """
-        Test R2S transliteration for one CSV sample.
-
-        This function name won't start with `test_` otherwise will be
-        automatically run without parameters.
-        """
-        config = get_language(self.tbl)
-        if config["has_r2s"]:
-            txl = transliterate(
-                    self.roman, self.tbl,
-                    t_dir="r2s",
-                    capitalize=self.options.get("capitalize", False),
-                    options=self.options)[0]
-            self.assertEqual(
-                    txl, self.script,
-                    f"R2S transliteration error for {self.tbl}!\n"
-                    f"Original: {self.roman}")
-
 
 def make_suite():
     """
@@ -85,13 +63,13 @@ def make_suite():
         for row in csv:
             if len(row[0]):
                 # Inject transliteration info in the test case.
-                for tname in ("sample_s2r", "sample_r2s"):
-                    tcase = TestTrans(tname)
-                    tcase.tbl = row[0]
-                    tcase.script = row[1].strip()
-                    tcase.roman = row[2].strip()
-                    tcase.options = jloads(row[3]) if len(row[3]) else {}
-                    suite.addTest(tcase)
+                tcase = TestTrans("sample")
+                tcase.tbl = row[0]
+                tcase.script = row[1].strip()
+                tcase.roman = row[2].strip()
+                tcase.options = jloads(row[3]) if len(row[3]) else {}
+
+                suite.addTest(tcase)
 
     return suite
 
diff --git a/tests/test03_capitalization.py b/tests/test03_capitalization.py
index 06acecd..eaf53b8 100644
--- a/tests/test03_capitalization.py
+++ b/tests/test03_capitalization.py
@@ -2,10 +2,10 @@
 from unittest import TestCase
 
 from scriptshifter.trans import transliterate
+from scriptshifter.tables import init_db
 
 
 def setUpModule():
-    from scriptshifter.tables import init_db
     init_db()
 
 
diff --git a/tests/test04_rest_api.py b/tests/test04_rest_api.py
index 1f54b0e..bf065f4 100644
--- a/tests/test04_rest_api.py
+++ b/tests/test04_rest_api.py
@@ -4,13 +4,13 @@
 from unittest import TestCase
 
 from scriptshifter.rest_api import app
+from scriptshifter.tables import init_db
 
 
 EP = "http://localhost:8000"
 
 
 def setUpModule():
-    from scriptshifter.tables import init_db
     init_db()
 
 

From efb27b8707b92f52e0c463b1a0cc5f1daa900ce3 Mon Sep 17 00:00:00 2001
From: scossu <stefano@cossu.cc>
Date: Mon, 25 Nov 2024 15:42:16 -0500
Subject: [PATCH 5/6] WIP Add tests for regex ignore patterns.

---
 sscli                                  |  2 +-
 tests/data/script_samples/unittest.csv |  2 ++
 tests/tables/data/regex.yml            | 11 +++++++++++
 tests/tables/index.yml                 |  3 +++
 tests/test02_transliteration.py        |  7 +++++--
 5 files changed, 22 insertions(+), 3 deletions(-)
 create mode 100644 tests/tables/data/regex.yml

diff --git a/sscli b/sscli
index 154aaf2..dca7334 100755
--- a/sscli
+++ b/sscli
@@ -10,7 +10,7 @@ from os import path
 
 from scriptshifter import DB_PATH
 from scriptshifter.tables import init_db as _init_db
-from tests import test_sample
+from tests.integration_tests import test_sample
 
 
 @click.group()
diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv
index eb4a1f6..79e12fa 100644
--- a/tests/data/script_samples/unittest.csv
+++ b/tests/data/script_samples/unittest.csv
@@ -4,3 +4,5 @@ rot3,defg,abcd,,
 rot3,HIJK,KLMN,"{""t_dir"": ""r2s""}",
 rot3,st uv,Vw xy,"{""t_dir"": ""r2s"", ""capitalize"": ""first""}",
 rot3,st uv,Vw Xy,"{""t_dir"": ""r2s"", ""capitalize"": ""all""}",
+regex,Hello abc,Hello 907,"{""t_dir"": ""r2s""}",
+regex,Hollo abc,Hollo 907,"{""t_dir"": ""r2s""}",
diff --git a/tests/tables/data/regex.yml b/tests/tables/data/regex.yml
new file mode 100644
index 0000000..6f15123
--- /dev/null
+++ b/tests/tables/data/regex.yml
@@ -0,0 +1,11 @@
+---
+# Test file for regex ignoring.
+
+general:
+  name: Test regex ignoring.
+  parents:
+    - inherited
+
+roman_to_script:
+  ignore_ptn:
+    - "[hH][aeu]llo"
diff --git a/tests/tables/index.yml b/tests/tables/index.yml
index c45f9eb..3b4aad4 100644
--- a/tests/tables/index.yml
+++ b/tests/tables/index.yml
@@ -1,3 +1,4 @@
+---
 inherited:
   name: Test inheritance leaf file
   marc_code: inh
@@ -10,5 +11,7 @@ cap_inherited:
   name: Test capitalization
 ordering:
   name: Test ordering
+regex:
+  name: inherited config + regex ignore.
 rot3:
   name: Test ROT3 hooks
diff --git a/tests/test02_transliteration.py b/tests/test02_transliteration.py
index 3c73de8..9e3856b 100644
--- a/tests/test02_transliteration.py
+++ b/tests/test02_transliteration.py
@@ -38,10 +38,13 @@ def sample(self):
         automatically run without parameters.
         """
         config = get_language(self.tbl)
-        if config["has_s2r"]:
+        t_dir = self.options.get("t_dir", "s2r")
+        if (
+                t_dir == "s2r" and config["has_s2r"]
+                or t_dir == "r2s" and config["has_r2s"]):
             txl = transliterate(
                     self.script, self.tbl,
-                    t_dir=self.options.get("t_dir", "s2r"),
+                    t_dir=t_dir,
                     capitalize=self.options.get("capitalize", False),
                     options=self.options)[0]
             self.assertEqual(

From 91f3ad98ab1ce018d654a475c706a5250d42d12d Mon Sep 17 00:00:00 2001
From: scossu <stefano@cossu.cc>
Date: Mon, 2 Dec 2024 22:24:03 -0500
Subject: [PATCH 6/6] Complete basic tests for regex ignore.

---
 scriptshifter/trans.py                 | 30 ++++++++++++++++++++++----
 tests/data/script_samples/unittest.csv |  4 ++--
 tests/tables/data/regex.yml            | 10 ++++++++-
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/scriptshifter/trans.py b/scriptshifter/trans.py
index 8f5a39e..0c8a74c 100644
--- a/scriptshifter/trans.py
+++ b/scriptshifter/trans.py
@@ -1,7 +1,7 @@
 import logging
 
 from importlib import import_module
-from re import compile
+from re import Pattern, compile
 
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.tables import (
@@ -152,7 +152,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
             # token or exit the scanning loop altogether.
             hret = _run_hook("begin_input_token", ctx)
             if hret == BREAK:
-                Logger.debug("Breaking text scanning from hook signal.")
+                logger.debug("Breaking text scanning from hook signal.")
                 break
             if hret == CONT:
                 logger.debug("Skipping scanning iteration from hook signal.")
@@ -170,8 +170,21 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
                     if hret == CONT:
                         continue
 
-                    step = len(ctx.tk)
-                    if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
+                    _matching = False
+                    if type(ctx.tk) is Pattern:
+                        # Search RE pattern beginning at cursor.
+                        if _ptn_match := ctx.tk.match(ctx.src[ctx.cur:]):
+                            ctx.tk = _ptn_match[0]
+                            logger.debug(f"Matched regex: {ctx.tk}")
+                            step = len(ctx.tk)
+                            _matching = True
+                    else:
+                        # Search exact match.
+                        step = len(ctx.tk)
+                        if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
+                            _matching = True
+
+                    if _matching:
                         # The position matches an ignore token.
                         hret = _run_hook("on_ignore_match", ctx)
                         if hret == BREAK:
@@ -182,6 +195,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
                         logger.info(f"Ignored token: {ctx.tk}")
                         ctx.dest_ls.append(ctx.tk)
                         ctx.cur += step
+                        if ctx.cur >= len(ctx.src):
+                            # Reached end of string. Stop ignoring.
+                            # The outer loop will exit immediately after.
+                            ctx.ignoring = False
+                            break
+
                         cur_char = ctx.src[ctx.cur]
                         ctx.ignoring = True
                         break
@@ -194,6 +213,9 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
             delattr(ctx, "tk")
             delattr(ctx, "ignoring")
 
+            if ctx.cur >= len(ctx.src):
+                break
+
             # Begin transliteration token lookup.
             ctx.match = False
 
diff --git a/tests/data/script_samples/unittest.csv b/tests/data/script_samples/unittest.csv
index 79e12fa..fda09ce 100644
--- a/tests/data/script_samples/unittest.csv
+++ b/tests/data/script_samples/unittest.csv
@@ -4,5 +4,5 @@ rot3,defg,abcd,,
 rot3,HIJK,KLMN,"{""t_dir"": ""r2s""}",
 rot3,st uv,Vw xy,"{""t_dir"": ""r2s"", ""capitalize"": ""first""}",
 rot3,st uv,Vw Xy,"{""t_dir"": ""r2s"", ""capitalize"": ""all""}",
-regex,Hello abc,Hello 907,"{""t_dir"": ""r2s""}",
-regex,Hollo abc,Hollo 907,"{""t_dir"": ""r2s""}",
+regex,Hello abc,Hello 678,"{""t_dir"": ""r2s""}",
+regex,Hullo abc,5u22o 678,"{""t_dir"": ""r2s""}",
diff --git a/tests/tables/data/regex.yml b/tests/tables/data/regex.yml
index 6f15123..cf18a09 100644
--- a/tests/tables/data/regex.yml
+++ b/tests/tables/data/regex.yml
@@ -8,4 +8,12 @@ general:
 
 roman_to_script:
   ignore_ptn:
-    - "[hH][aeu]llo"
+    - "[hH][ae]llo"
+
+  map:
+    "h": "1"
+    "H": "5"
+    "l": "2"
+    "a": "6"
+    "b": "7"
+    "c": "8"