Skip to content

Commit

Permalink
Don't use K form normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
kg583 committed May 28, 2024
1 parent 4b11848 commit 327367e
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions tivars/tokenizer/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""


import re
import unicodedata

from tivars.models import *
Expand Down Expand Up @@ -83,13 +84,13 @@ def encode(string: str, *,

def normalize(string: str):
"""
Applies NFKC normalization to a given string to ensure recognition of certain Unicode characters used as token names
Applies NFC normalization to a given string to ensure recognition of certain Unicode characters used as token names
:param string: The text to normalize
:return: The text in ``string`` normalized
"""

return unicodedata.normalize("NFKC", string).replace("Θ", "θ")
return re.sub("[\u0398\u03F4\u1DBF]", "θ", unicodedata.normalize("NFC", string))


# Yucky scope nonsense to avoid a globals() call
Expand Down

0 comments on commit 327367e

Please sign in to comment.