Khmer Text Normalization and Verbalization Toolkit.
pip install tha
import tha.normalize
import tha.phone_numbers
import tha.urls
import tha.datetime
import tha.hashtags
import tha.ascii_lines
import tha.license_plate
import tha.cardinals
import tha.decimals
import tha.ordinals
import tha.currency
import tha.parenthesis
import tha.repeater
## Normalize
assert tha.normalize.processor("αα·α\u200bα²αα") == "αα·αα±αα"
## Phone Numbers
assert tha.phone_numbers.processor("010123123", chunk_size=2) == "0▁10▁12▁31▁23"
assert tha.phone_numbers.processor("010123123", chunk_size=3) == "0▁10▁123▁123"
assert tha.phone_numbers.processor("0961231234", chunk_size=3) == "0▁96▁123▁1234"
## URLs and emails
assert tha.urls.processor("example@gmail.com") == "example at g▁mail dot com"
assert tha.urls.processor("https://google.com") == "google dot com"
assert tha.urls.processor("http://google.com") == "google dot com"
assert tha.urls.processor("google.com") == "google dot com"
assert tha.urls.processor("google.gov.kh") == "google dot gov dot k▁h"
assert tha.urls.processor("google.com.kh") == "google dot com dot k▁h"
## Time
assert tha.datetime.time_processor("10:23AM") == "10 23▁A▁M"
assert tha.datetime.time_processor("10:23PM") == "10 23▁P▁M"
assert tha.datetime.time_processor("1:23PM") == "1 23▁P▁M"
## Date
assert tha.datetime.date_processor("2024-01-02") == "2024 01 02"
assert tha.datetime.date_processor("01-02-2034") == "01 02 2034"
## Hashtags
assert (
tha.hashtags.processor("Hello world #this_will_remove hello") == "Hello world hello"
)
assert tha.hashtags.processor("Hello world #αα»α hello") == "Hello world hello"
assert tha.hashtags.processor("Hello world #αα»α1234 hello") == "Hello world hello"
## ASCII Lines
assert tha.ascii_lines.processor("Remove --- asdasd") == "Remove asdasd"
assert tha.ascii_lines.processor("Remove\n###\nasdasd") == "Remove\n\nasdasd"
## Cambodia License Plate
assert tha.license_plate.processor("1A 1234") == "1 A 12▁34"
assert tha.license_plate.processor("1A 4444") == "1 A ααΆααα4"
## Number - Cardinals
assert tha.cardinals.processor("1234") == "αα½αααΆααβααΈαααβααΆααα·ααα½α"
assert tha.cardinals.processor("1") == "αα½α"
assert tha.cardinals.processor("1β2") == "αα½αβααΈα"
assert tha.cardinals.processor("-1") == "ααβαα½α"
assert tha.cardinals.processor("10") == "ααα"
assert tha.cardinals.processor("15") == "αααααααΆα"
assert tha.cardinals.processor("100") == "αα½ααα"
assert tha.cardinals.processor("10000") == "αα½ααααΊα"
assert tha.cardinals.processor("10000.234") == "αα½ααααΊα.ααΈαααβααΆααα·ααα½α"
assert tha.cardinals.processor("-10000.234") == "ααβαα½ααααΊα.ααΈαααβααΆααα·ααα½α"
assert tha.cardinals.processor("-10000,234") == "ααβαα½ααααΊα,ααΈαααβααΆααα·ααα½α"
## Number - Decimals
assert tha.decimals.processor("123.324") == "αα½αααβααααααΈβα
α»α
βααΈααβαααααα½α"
assert tha.decimals.processor("123.001") == "αα½αααβααααααΈβα
α»α
βααΌαααβααΌαααβαα½α"
assert tha.decimals.processor("-123.0012") == "ααβαα½αααβααααααΈβα
α»α
βααΌαααβααΌαααβαααααΈα"
assert tha.decimals.processor("-123,0012") == "ααβαα½αααβααααααΈβαααααβααΌαααβααΌαααβαααααΈα"
## Number - Ordinals
assert tha.ordinals.processor("5th") == "ααΈβααααΆα"
assert tha.ordinals.processor("3rd") == "ααΈβααΈ"
assert tha.ordinals.processor("1st") == "ααΈβαα½α"
assert tha.ordinals.processor("10th") == "ααΈβααα"
assert tha.ordinals.processor("10") == "10"
## Number - Currency
assert tha.currency.processor("$100.01") == "αα½ααααα»ααααΆαβαα½αααα"
assert tha.currency.processor("$100") == "αα½αααβαα»ααααΆα"
assert tha.currency.processor("100$") == "αα½ααααα»ααααΆα"
assert tha.currency.processor("100α") == "αα½αααααα"
assert tha.currency.processor("100.32α") == "αα½αααβα
α»α
βααΆααα·αααΈαααα"
assert tha.currency.processor("100.0032α") == "αα½αααβα
α»α
βααΌαααβααΌαααβααΆααα·αααΈαααα"
## Parenthesis
assert tha.parenthesis.processor("Hello (this will be ignored) world") == "Hello world"
## Iteration Mark
def fake_tokenizer(_):
return ["ααΆαα", "ααΆα", "αα
", "ααααα·α
", "αααα"]
assert (
tha.repeater.processor("ααΆααααΆααα
ααααα·α
αααααα αΎα", tokenizer=fake_tokenizer)
== "ααΆααααΆααα
ααααα·α
ααααβααααα·α
ααααα αΎα"
)