Skip to content

Commit

Permalink
Improve text_collapse performance
Browse files (browse the repository at this point in the history)
  • Loading branch information
titusz committed Oct 20, 2024
1 parent 76dc011 commit 366185f
Showing 1 changed file with 9 additions and 11 deletions.
20 changes: 9 additions & 11 deletions iscc_core/code_content_text.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -17,6 +17,7 @@
- Count characters of collapsed text
- Apply [`soft_hash_text_v0`][iscc_core.code_content_text.soft_hash_text_v0] to collapsed text
"""

import unicodedata
import xxhash
import iscc_core as ic
Expand Down Expand Up @@ -124,18 +125,15 @@ def text_collapse(text):
:rtype: str
"""

# Decompose with NFD
text = unicodedata.normalize("NFD", text)
# Decompose with NFD and convert to lower case
text = unicodedata.normalize("NFD", text).lower()

# Remove all whitespace and convert text to lower case
text = "".join(text.split()).lower()
# Remove whitespace and filter characters in one pass
filtered_chars = []

# Filter control characters, marks (diacritics), and punctuation
text = "".join(
ch for ch in text if unicodedata.category(ch)[0] not in ic.core_opts.text_unicode_filter
)
for ch in text:
if not ch.isspace() and unicodedata.category(ch)[0] not in ic.core_opts.text_unicode_filter:
filtered_chars.append(ch)

# Recombine
text = unicodedata.normalize("NFKC", text)

return text
return unicodedata.normalize("NFKC", "".join(filtered_chars))

0 comments on commit 366185f

Please sign in to comment.