Commit
Merge pull request #141 from lcnetdev/thai_word_splitting
Thai word splitting
scossu authored Oct 22, 2024
2 parents e2f0d2b + 237f1f8 commit ac29135
Showing 5 changed files with 27 additions and 5 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
 # Core application dependencies.
 aksharamukha>=2.2,<3
+esupar>=1.7.5
 flask>=2.3,<3
 flask-cors>=4.0,<5
 python-dotenv>=1.0,<2
8 changes: 8 additions & 0 deletions scriptshifter/hooks/asian_tokenizer/__init__.py
@@ -0,0 +1,8 @@
+from esupar import load
+
+
+def s2r_tokenize(ctx, model):
+    nlp = load(model)
+    token_data = nlp(ctx.src)
+
+    ctx._src = " ".join(token_data.values[1])
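
For context, a minimal sketch of how this hook behaves in isolation, with a SimpleNamespace standing in for scriptshifter's real context object (the stand-in and the sample string are illustrative assumptions, not part of the commit):

# Hedged sketch: segment unsegmented Thai with esupar, mirroring
# s2r_tokenize above; ctx here is a stand-in, not scriptshifter's context.
from types import SimpleNamespace

from esupar import load

ctx = SimpleNamespace(src="ภาษาไทยไม่มีช่องว่าง", _src=None)

nlp = load("th")  # "th" is the compact model alias used in thai_alt.yml
token_data = nlp(ctx.src)

# Row 1 of the parse output holds the surface word forms; joining them
# with spaces yields space-delimited tokens for downstream mapping.
ctx._src = " ".join(token_data.values[1])
print(ctx._src)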
3 changes: 3 additions & 0 deletions scriptshifter/tables/data/thai.yml
@@ -33,6 +33,9 @@ options:
 script_to_roman:
   hooks:
     post_config:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "KoichiYasuoka/roberta-base-thai-spm-upos"
       -
        - aksharamukha.romanizer.s2r_post_config
        - src_script: "Thai"
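
Each hook entry above pairs a dotted function path with a mapping of keyword arguments. A rough sketch of how such an entry could be resolved and invoked follows; the real dispatch lives in scriptshifter's hook machinery, and the module prefix and call shape here are assumptions:

# Hedged sketch of hook-entry dispatch; not scriptshifter's actual code.
from importlib import import_module
from types import SimpleNamespace

ctx = SimpleNamespace(src="ภาษาไทย", _src=None)  # stand-in context

entry = [
    "asian_tokenizer.s2r_tokenize",
    {"model": "KoichiYasuoka/roberta-base-thai-spm-upos"},
]

mod_path, fn_name = entry[0].rsplit(".", 1)
# Assumption: hook modules live under the scriptshifter.hooks package.
fn = getattr(import_module(f"scriptshifter.hooks.{mod_path}"), fn_name)
fn(ctx, **entry[1])  # writes the space-joined tokens to ctx._src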
5 changes: 5 additions & 0 deletions scriptshifter/tables/data/thai_alt.yml
@@ -4,6 +4,11 @@ general:
   case_sensitive: false
 
 script_to_roman:
+  hooks:
+    post_normalize:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "th"
   map:
     # COMMON SPECIAL CHARACTERS
 
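
Unlike thai.yml, this table attaches the tokenizer to post_normalize, so segmentation runs after the normalization rules and uses esupar's compact "th" model rather than the RoBERTa UPOS model. A simplified, illustrative sketch of that ordering (the function and parameter names here are not scriptshifter's API):

# Illustrative only: character-level rules rewrite the source first,
# then the post_normalize hook sees the normalized text.
def normalize_then_tokenize(ctx, norm_rules, s2r_tokenize):
    for nk, nv in norm_rules.items():
        ctx._src = ctx.src.replace(nk, nv)  # normalization rules
    return s2r_tokenize(ctx, model="th")    # esupar segmentation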
15 changes: 10 additions & 5 deletions scriptshifter/trans.py
@@ -120,11 +121,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
     if _run_hook("post_config", ctx) == BREAK:
         return getattr(ctx, "dest", ""), ctx.warnings
 
-    _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))
-
-    if _run_hook("post_normalize", ctx) == BREAK:
+    # _normalize_src returns the results of the post_normalize hook.
+    if _normalize_src(
+            ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
         return getattr(ctx, "dest", ""), ctx.warnings
 
-    logger.debug(f"Normalized source: {ctx.src}")
     lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))
 
     # Loop through source characters. The increment of each loop depends on
@@ -151,7 +152,7 @@
         # token or exit the scanning loop altogether.
         hret = _run_hook("begin_input_token", ctx)
         if hret == BREAK:
             logger.debug("Breaking text scanning from hook signal.")
             break
         if hret == CONT:
             logger.debug("Skipping scanning iteration from hook signal.")
@@ -315,10 +316,14 @@
 def _normalize_src(ctx, norm_rules):
     """
     Normalize source text according to rules.
+    NOTE: this manipulates the protected source attribute so it may not
+    correspond to the originally provided source.
     """
     for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)
+    logger.debug(f"Normalized source: {ctx.src}")
 
+    return _run_hook("post_normalize", ctx)
 
 
 def _is_bow(cur, ctx, word_boundary):
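
With this refactor, _normalize_src ends by running the post_normalize hook and handing its signal back, so a hook can abort transliteration early. A hedged sketch of a hook using that contract; BREAK is a stand-in constant here, while the real sentinel is the one scriptshifter.trans imports:

# Hedged sketch, not scriptshifter code.
BREAK = "break"  # stand-in for the real BREAK sentinel

def my_post_normalize(ctx):
    # If normalization left nothing to transliterate, signal BREAK;
    # _normalize_src returns it and transliterate() then returns
    # (ctx.dest, ctx.warnings) immediately.
    if not ctx.src.strip():
        ctx.dest = ""
        return BREAK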
