fix: improve content-extraction for scheduling uuid detection (#520)

fix: improve content-extraction for scheduling uuid detection. Fixes #519. Misc.
MartinBernstorff · Jan 3, 2024 · b0ac0ce · b0ac0ce
2 parents 234ad32 + a112e11
commit b0ac0ce
Show file tree

Hide file tree

Showing 6 changed files with 50 additions and 13 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -21,6 +21,7 @@
             "justMyCode": true,
             "args": [
                 "--input-dir=/Users/Shared/life-lessons/docs/life-lessons",
+                "--max-deletions-per-run=0",
             ]
         }
     ]

diff --git a/memium/destination/ankiconnect/test_anki_converter.py b/memium/destination/ankiconnect/test_anki_converter.py
@@ -18,7 +18,7 @@
                 add_tags=["FakeTag"],
             ),
             FakeAnkiQA(
-                uuid=6947886967
+                uuid=4875918425
             ),  # Ensure that UUID generation remains stable to retain idempotency over time
         ),
         (

diff --git a/memium/source/extractors/test_prompt_extractor_qa.py b/memium/source/extractors/test_prompt_extractor_qa.py
@@ -26,4 +26,4 @@ def test_qa_prompt_extractor(tmpdir: Path):
     assert prompt.question == "What is the meaning of life?"
     assert prompt.answer == "42"
     assert prompt.tags == ["anki/tag/test_tag"]
-    assert prompt.scheduling_uid == 9385242780
+    assert prompt.scheduling_uid == 3643087944
diff --git a/memium/source/prompts/prompt_qa.py b/memium/source/prompts/prompt_qa.py
@@ -14,7 +14,7 @@ class QAPrompt(BasePrompt):
     @property
     def scheduling_uid_str(self) -> str:
         """Str used for generating the update_uid. Super helpful for debugging."""
-        return clean_str(f"{self.question}_{self.answer}")
+        return f"{clean_str(self.question)}_{clean_str(self.answer)}"
 
     @property
     def scheduling_uid(self) -> int:

diff --git a/memium/utils/hash_cleaned_str.py b/memium/utils/hash_cleaned_str.py
@@ -1,17 +1,25 @@
 import hashlib
+import re
 
 import unidecode
 from bs4 import BeautifulSoup
 
 
-def remove_spaces(text: str) -> str:
-    """Remove spaces from a string."""
-    return text.replace(" ", "")
+def replace_whitespace(text: str) -> str:
+    return "_".join(text.split())
 
 
-def remove_html_tags(text: str) -> str:
-    clean_text = BeautifulSoup(text, "html.parser").text
-    return clean_text
+def remove_non_content_html_tags(text: str) -> str:
+    """Strip HTML markup from a string, keeping each <img> tag's src as content; src-less <img> tags are dropped."""
+    soup = BeautifulSoup(text, "html.parser")
+
+    for img in soup.find_all("img"):
+        if img.get("src"):
+            img.replace_with(img["src"])
+        else:
+            img.decompose()  # Tag has no delete(): that lookup resolves to find("delete") -> None, and calling it raises TypeError
+
+    return soup.text
 
 
 def remove_punctuation(text: str) -> str:
@@ -24,20 +32,37 @@ def remove_punctuation(text: str) -> str:
     return cleaned
 
 
+def remove_markdown_links(text: str) -> str:
+    """Get the text from markdown links, e.g. '[link text](link url)' -> 'link text'"""
+    return re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)
+
+
 def decode_unicode(text: str) -> str:
     """Standardise accents in a string."""
     return unidecode.unidecode(text)
 
 
+def remove_list_markup(text: str) -> str:
+    """Remove markdown list markup ('1. ', '* ', '- ') from the start of each line, at any indentation level."""
+    without_list_numbers = re.sub(r"^\s*\d+\. ", "", text, flags=re.MULTILINE)
+    without_bullets = re.sub(
+        r"^\s*\* ", "", without_list_numbers, flags=re.MULTILINE  # \s* (not \s): also match unindented and deeper-indented items
+    )
+    without_dashes = re.sub(r"^\s*- ", "", without_bullets, flags=re.MULTILINE)
+    return without_dashes
+
+
 def clean_str(input_str: str) -> str:
     """Clean string before hashing, so changes to spacing, punctuation, newlines etc. do not affect the hash."""
     cleaned = input_str
 
     for cleaner in [
-        remove_html_tags,
+        remove_non_content_html_tags,
+        remove_markdown_links,
+        remove_list_markup,
         remove_punctuation,
-        remove_spaces,
         decode_unicode,
+        replace_whitespace,
     ]:
         cleaned = cleaner(cleaned)
 

diff --git a/memium/utils/test_hash_cleaned_str.py b/memium/utils/test_hash_cleaned_str.py
@@ -36,7 +36,18 @@ def test_hash_cleaned_str_should_remove_html_tags(
 
 
 @pytest.mark.parametrize(
-    ("input_str", "expected"), [("Is <2, but >4", "is2but4")]
+    ("input_str", "expected"),
+    [
+        ("Is <2, but >4", "is_2_but_4"),
+        ("Q. ![](src). Question?", "q_src_question"),
+        ("pre <img src='testsrc' /> post", "pre_testsrc_post"),
+        ("\t", ""),
+        ("\n", ""),
+        ("""1. One\n\t2. Two""", "one_two"),
+        ("""* One\n\t* Two""", "one_two"),
+        ("""- One\n\t- Two""", "one_two"),
+        ("[link1](blah) and [link2](blah)", "link1_and_link2"),
+    ],
 )
 def test_str_cleaner(input_str: str, expected: str):
-    assert clean_str(input_str) == clean_str(expected)
+    assert clean_str(input_str) == expected