From 4acfd598ddf12df573f5151e6e3374209f91bf1c Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Wed, 3 Jan 2024 16:43:59 +0100 Subject: [PATCH 1/3] fix: improve content-extraction for scheduling uuid detection Fixes #519 --- .vscode/launch.json | 1 + memium/source/prompts/prompt_qa.py | 2 +- memium/utils/hash_cleaned_str.py | 37 +++++++++++++++++++++------ memium/utils/test_hash_cleaned_str.py | 13 ++++++++-- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index d3c10a3..77f4e30 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -21,6 +21,7 @@ "justMyCode": true, "args": [ "--input-dir=/Users/Shared/life-lessons/docs/life-lessons", + "--max-deletions-per-run=0", ] } ] diff --git a/memium/source/prompts/prompt_qa.py b/memium/source/prompts/prompt_qa.py index 6433543..b7c77e7 100644 --- a/memium/source/prompts/prompt_qa.py +++ b/memium/source/prompts/prompt_qa.py @@ -14,7 +14,7 @@ class QAPrompt(BasePrompt): @property def scheduling_uid_str(self) -> str: """Str used for generating the update_uid. Super helpful for debugging.""" - return clean_str(f"{self.question}_{self.answer}") + return f"{clean_str(self.question)}_{clean_str(self.answer)}" @property def scheduling_uid(self) -> int: diff --git a/memium/utils/hash_cleaned_str.py b/memium/utils/hash_cleaned_str.py index b437089..2927f44 100644 --- a/memium/utils/hash_cleaned_str.py +++ b/memium/utils/hash_cleaned_str.py @@ -1,17 +1,25 @@ import hashlib +import re import unidecode from bs4 import BeautifulSoup -def remove_spaces(text: str) -> str: - """Remove spaces from a string.""" - return text.replace(" ", "") +def replace_whitespace(text: str) -> str: + return "_".join(text.split()) -def remove_html_tags(text: str) -> str: - clean_text = BeautifulSoup(text, "html.parser").text - return clean_text +def remove_non_content_html_tags(text: str) -> str: + """Remove non-content html tags from a string.""" + soup = BeautifulSoup(text, "html.parser") + + for img in soup.find_all("img"): + if img.get("src"): + img.replace_with(img["src"]) + else: + img.delete() + + return soup.text def remove_punctuation(text: str) -> str: @@ -24,20 +32,33 @@ def remove_punctuation(text: str) -> str: return cleaned +def remove_markdown_links(text: str) -> str: + """Get the text from markdown links, e.g. '[link text](link url)' -> 'link text'""" + return re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text) + + def decode_unicode(text: str) -> str: """Standardise accents in a string.""" return unidecode.unidecode(text) +def remove_list_markup(text: str) -> str: + """Remove markdown list markup, including '1. Item', '*' and '-'.""" + without_list_numbers = re.sub(r"^\d+\. ", "", text, flags=re.MULTILINE) + return without_list_numbers.replace("* ", "").replace("- ", "") + + def clean_str(input_str: str) -> str: """Clean string before hashing, so changes to spacing, punctuation, newlines etc. do not affect the hash.""" cleaned = input_str for cleaner in [ - remove_html_tags, + remove_non_content_html_tags, + remove_markdown_links, + remove_list_markup, remove_punctuation, - remove_spaces, decode_unicode, + replace_whitespace, ]: cleaned = cleaner(cleaned) diff --git a/memium/utils/test_hash_cleaned_str.py b/memium/utils/test_hash_cleaned_str.py index 77f3f20..d768249 100644 --- a/memium/utils/test_hash_cleaned_str.py +++ b/memium/utils/test_hash_cleaned_str.py @@ -36,7 +36,16 @@ def test_hash_cleaned_str_should_remove_html_tags( @pytest.mark.parametrize( - ("input_str", "expected"), [("Is <2, but >4", "is2but4")] + ("input_str", "expected"), + [ + ("Is <2, but >4", "is_2_but_4"), + ("Q. ![](src). Question?", "q_src_question"), + ("pre post", "pre_testsrc_post"), + ("\t", ""), + ("\n", ""), + ("""1. One\n2. Two""", "one_two"), + ("[link1](blah) and [link2](blah)", "link1_and_link2"), + ], ) def test_str_cleaner(input_str: str, expected: str): - assert clean_str(input_str) == clean_str(expected) + assert clean_str(input_str) == expected From f53455c8a707eb0f5b2a085d734e418bdac0d967 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Wed, 3 Jan 2024 15:45:46 +0000 Subject: [PATCH 2/3] misc. --- memium/destination/ankiconnect/test_anki_converter.py | 2 +- memium/source/extractors/test_prompt_extractor_qa.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/memium/destination/ankiconnect/test_anki_converter.py b/memium/destination/ankiconnect/test_anki_converter.py index 33bf9c2..f72e19c 100644 --- a/memium/destination/ankiconnect/test_anki_converter.py +++ b/memium/destination/ankiconnect/test_anki_converter.py @@ -18,7 +18,7 @@ add_tags=["FakeTag"], ), FakeAnkiQA( - uuid=6947886967 + uuid=4875918425 ), # Ensure that UUID generation remains stable to retain idempotency over time ), ( diff --git a/memium/source/extractors/test_prompt_extractor_qa.py b/memium/source/extractors/test_prompt_extractor_qa.py index 0a8c557..864f9bb 100644 --- a/memium/source/extractors/test_prompt_extractor_qa.py +++ b/memium/source/extractors/test_prompt_extractor_qa.py @@ -26,4 +26,4 @@ def test_qa_prompt_extractor(tmpdir: Path): assert prompt.question == "What is the meaning of life?" assert prompt.answer == "42" assert prompt.tags == ["anki/tag/test_tag"] - assert prompt.scheduling_uid == 9385242780 + assert prompt.scheduling_uid == 3643087944 From a112e1187f5898f8d55d86a279a27c770f3561fe Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Wed, 3 Jan 2024 15:58:47 +0000 Subject: [PATCH 3/3] fix: only remove list markup if at beginning of line --- memium/utils/hash_cleaned_str.py | 8 ++++++-- memium/utils/test_hash_cleaned_str.py | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/memium/utils/hash_cleaned_str.py b/memium/utils/hash_cleaned_str.py index 2927f44..4351645 100644 --- a/memium/utils/hash_cleaned_str.py +++ b/memium/utils/hash_cleaned_str.py @@ -44,8 +44,12 @@ def decode_unicode(text: str) -> str: def remove_list_markup(text: str) -> str: """Remove markdown list markup, including '1. Item', '*' and '-'.""" - without_list_numbers = re.sub(r"^\d+\. ", "", text, flags=re.MULTILINE) - return without_list_numbers.replace("* ", "").replace("- ", "") + without_list_numbers = re.sub(r"^\s*\d+\. ", "", text, flags=re.MULTILINE) + without_bullets = re.sub( + r"^\s\* ", "", without_list_numbers, flags=re.MULTILINE + ) + without_dashes = re.sub(r"^\s- ", "", without_bullets, flags=re.MULTILINE) + return without_dashes def clean_str(input_str: str) -> str: diff --git a/memium/utils/test_hash_cleaned_str.py b/memium/utils/test_hash_cleaned_str.py index d768249..fc0c860 100644 --- a/memium/utils/test_hash_cleaned_str.py +++ b/memium/utils/test_hash_cleaned_str.py @@ -43,7 +43,9 @@ def test_hash_cleaned_str_should_remove_html_tags( ("pre post", "pre_testsrc_post"), ("\t", ""), ("\n", ""), - ("""1. One\n2. Two""", "one_two"), + ("""1. One\n\t2. Two""", "one_two"), + ("""* One\n\t* Two""", "one_two"), + ("""- One\n\t- Two""", "one_two"), ("[link1](blah) and [link2](blah)", "link1_and_link2"), ], )