From 366185f645904d891e7e5afd10f8bb65abaf328c Mon Sep 17 00:00:00 2001
From: Titusz Pan
Date: Sun, 20 Oct 2024 09:22:18 +0200
Subject: [PATCH] Improve text_collapse performance

---
 iscc_core/code_content_text.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/iscc_core/code_content_text.py b/iscc_core/code_content_text.py
index e410468..4cf211d 100644
--- a/iscc_core/code_content_text.py
+++ b/iscc_core/code_content_text.py
@@ -17,6 +17,7 @@
 - Count characters of collapsed text
 - Apply [`soft_hash_text_v0`][iscc_core.code_content_text.soft_hash_text_v0] to collapsed text
 """
+
 import unicodedata
 import xxhash
 import iscc_core as ic
@@ -124,18 +125,15 @@ def text_collapse(text):
     :rtype: str
     """
 
-    # Decompose with NFD
-    text = unicodedata.normalize("NFD", text)
+    # Decompose with NFD and convert to lower case
+    text = unicodedata.normalize("NFD", text).lower()
 
-    # Remove all whitespace and convert text to lower case
-    text = "".join(text.split()).lower()
+    # Remove whitespace and filter characters in one pass
+    filtered_chars = []
 
-    # Filter control characters, marks (diacritics), and punctuation
-    text = "".join(
-        ch for ch in text if unicodedata.category(ch)[0] not in ic.core_opts.text_unicode_filter
-    )
+    for ch in text:
+        if not ch.isspace() and unicodedata.category(ch)[0] not in ic.core_opts.text_unicode_filter:
+            filtered_chars.append(ch)
 
     # Recombine
-    text = unicodedata.normalize("NFKC", text)
-
-    return text
+    return unicodedata.normalize("NFKC", "".join(filtered_chars))