Don't use K form normalization

TI-Toolkit · May 28, 2024 · 327367e · 327367e
1 parent 4b11848
commit 327367e
Showing 1 changed file with 3 additions and 2 deletions.
diff --git a/tivars/tokenizer/encoder.py b/tivars/tokenizer/encoder.py
@@ -3,6 +3,7 @@
 """
 
 
+import re
 import unicodedata
 
 from tivars.models import *
@@ -83,13 +84,13 @@ def encode(string: str, *,
 
 def normalize(string: str):
     """
-    Applies NFKC normalization to a given string to ensure recognition of certain Unicode characters used as token names
+    Applies NFC normalization to a given string and maps theta variants (U+0398, U+03F4, U+1DBF) to "θ" to ensure recognition of certain Unicode characters used as token names
 
     :param string: The text to normalize
     :return: The text in ``string`` normalized
     """
 
-    return unicodedata.normalize("NFKC", string).replace("Θ", "θ")
+    return re.sub("[\u0398\u03F4\u1DBF]", "θ", unicodedata.normalize("NFC", string))
 
 
 # Yucky scope nonsense to avoid a globals() call