Commit
Merge pull request #141 from lcnetdev/thai_word_splitting
Thai word splitting
scossu authored Oct 22, 2024
2 parents e2f0d2b + 237f1f8 commit ac29135
Showing 5 changed files with 27 additions and 5 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
 # Core application dependencies.
 aksharamukha>=2.2,<3
+esupar>=1.7.5
 flask>=2.3,<3
 flask-cors>=4.0,<5
 python-dotenv>=1.0,<2
8 changes: 8 additions & 0 deletions scriptshifter/hooks/asian_tokenizer/__init__.py
@@ -0,0 +1,8 @@
+from esupar import load
+
+
+def s2r_tokenize(ctx, model):
+    nlp = load(model)
+    token_data = nlp(ctx.src)
+
+    ctx._src = " ".join(token_data.values[1])
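
For context, a minimal sketch of how this hook behaves in isolation, with a SimpleNamespace standing in for scriptshifter's real context object (the stand-in and the sample string are illustrative assumptions, not part of the commit):

# Hedged sketch: segment unsegmented Thai with esupar, mirroring
# s2r_tokenize above; ctx here is a stand-in, not scriptshifter's context.
from types import SimpleNamespace

from esupar import load

ctx = SimpleNamespace(src="ภาษาไทยไม่มีช่องว่าง", _src=None)

nlp = load("th")  # "th" is the compact model alias used in thai_alt.yml
token_data = nlp(ctx.src)

# Row 1 of the parse output holds the surface word forms; joining them
# with spaces yields space-delimited tokens for downstream mapping.
ctx._src = " ".join(token_data.values[1])
print(ctx._src)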
3 changes: 3 additions & 0 deletions scriptshifter/tables/data/thai.yml
@@ -33,6 +33,9 @@ options:
 script_to_roman:
   hooks:
     post_config:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "KoichiYasuoka/roberta-base-thai-spm-upos"
       -
        - aksharamukha.romanizer.s2r_post_config
        - src_script: "Thai"
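
Each hook entry above pairs a dotted function path with a mapping of keyword arguments. A rough sketch of how such an entry could be resolved and invoked follows; the real dispatch lives in scriptshifter's hook machinery, and the module prefix and call shape here are assumptions:

# Hedged sketch of hook-entry dispatch; not scriptshifter's actual code.
from importlib import import_module
from types import SimpleNamespace

ctx = SimpleNamespace(src="ภาษาไทย", _src=None)  # stand-in context

entry = [
    "asian_tokenizer.s2r_tokenize",
    {"model": "KoichiYasuoka/roberta-base-thai-spm-upos"},
]

mod_path, fn_name = entry[0].rsplit(".", 1)
# Assumption: hook modules live under the scriptshifter.hooks package.
fn = getattr(import_module(f"scriptshifter.hooks.{mod_path}"), fn_name)
fn(ctx, **entry[1])  # writes the space-joined tokens to ctx._src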
5 changes: 5 additions & 0 deletions scriptshifter/tables/data/thai_alt.yml
@@ -4,6 +4,11 @@ general:
   case_sensitive: false
 
 script_to_roman:
+  hooks:
+    post_normalize:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "th"
   map:
     # COMMON SPECIAL CHARACTERS
 
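
Unlike thai.yml, this table attaches the tokenizer to post_normalize, so segmentation runs after the normalization rules and uses esupar's compact "th" model rather than the RoBERTa UPOS model. A simplified, illustrative sketch of that ordering (the function and parameter names here are not scriptshifter's API):

# Illustrative only: character-level rules rewrite the source first,
# then the post_normalize hook sees the normalized text.
def normalize_then_tokenize(ctx, norm_rules, s2r_tokenize):
    for nk, nv in norm_rules.items():
        ctx._src = ctx.src.replace(nk, nv)  # normalization rules
    return s2r_tokenize(ctx, model="th")    # esupar segmentation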
15 changes: 10 additions & 5 deletions scriptshifter/trans.py
@@ -120,11 +121,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
     if _run_hook("post_config", ctx) == BREAK:
         return getattr(ctx, "dest", ""), ctx.warnings
 
-    _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))
-
-    if _run_hook("post_normalize", ctx) == BREAK:
+    # _normalize_src returns the results of the post_normalize hook.
+    if _normalize_src(
+            ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
         return getattr(ctx, "dest", ""), ctx.warnings
 
-    logger.debug(f"Normalized source: {ctx.src}")
     lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))
 
     # Loop through source characters. The increment of each loop depends on
@@ -151,7 +152,7 @@
         # token or exit the scanning loop altogether.
         hret = _run_hook("begin_input_token", ctx)
         if hret == BREAK:
             logger.debug("Breaking text scanning from hook signal.")
             break
         if hret == CONT:
             logger.debug("Skipping scanning iteration from hook signal.")
@@ -315,10 +316,14 @@
 def _normalize_src(ctx, norm_rules):
     """
     Normalize source text according to rules.
+    NOTE: this manipulates the protected source attribute so it may not
+    correspond to the originally provided source.
     """
     for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)
+    logger.debug(f"Normalized source: {ctx.src}")
 
+    return _run_hook("post_normalize", ctx)
 
 
 def _is_bow(cur, ctx, word_boundary):
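
With this refactor, _normalize_src ends by running the post_normalize hook and handing its signal back, so a hook can abort transliteration early. A hedged sketch of a hook using that contract; BREAK is a stand-in constant here, while the real sentinel is the one scriptshifter.trans imports:

# Hedged sketch, not scriptshifter code.
BREAK = "break"  # stand-in for the real BREAK sentinel

def my_post_normalize(ctx):
    # If normalization left nothing to transliterate, signal BREAK;
    # _normalize_src returns it and transliterate() then returns
    # (ctx.dest, ctx.warnings) immediately.
    if not ctx.src.strip():
        ctx.dest = ""
        return BREAK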
