From 4acfd598ddf12df573f5151e6e3374209f91bf1c Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Wed, 3 Jan 2024 16:43:59 +0100
Subject: [PATCH 1/3] fix: improve content-extraction for scheduling uuid
 detection

Fixes #519
---
 .vscode/launch.json                   |  1 +
 memium/source/prompts/prompt_qa.py    |  2 +-
 memium/utils/hash_cleaned_str.py      | 37 +++++++++++++++++++++------
 memium/utils/test_hash_cleaned_str.py | 13 ++++++++--
 4 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index d3c10a3..77f4e30 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -21,6 +21,7 @@
             "justMyCode": true,
             "args": [
                 "--input-dir=/Users/Shared/life-lessons/docs/life-lessons",
+                "--max-deletions-per-run=0",
             ]
         }
     ]
diff --git a/memium/source/prompts/prompt_qa.py b/memium/source/prompts/prompt_qa.py
index 6433543..b7c77e7 100644
--- a/memium/source/prompts/prompt_qa.py
+++ b/memium/source/prompts/prompt_qa.py
@@ -14,7 +14,7 @@ class QAPrompt(BasePrompt):
     @property
     def scheduling_uid_str(self) -> str:
         """Str used for generating the update_uid. Super helpful for debugging."""
-        return clean_str(f"{self.question}_{self.answer}")
+        return f"{clean_str(self.question)}_{clean_str(self.answer)}"
 
     @property
     def scheduling_uid(self) -> int:
diff --git a/memium/utils/hash_cleaned_str.py b/memium/utils/hash_cleaned_str.py
index b437089..2927f44 100644
--- a/memium/utils/hash_cleaned_str.py
+++ b/memium/utils/hash_cleaned_str.py
@@ -1,17 +1,25 @@
 import hashlib
+import re
 
 import unidecode
 from bs4 import BeautifulSoup
 
 
-def remove_spaces(text: str) -> str:
-    """Remove spaces from a string."""
-    return text.replace(" ", "")
+def replace_whitespace(text: str) -> str:
+    return "_".join(text.split())  # split() trims ends, collapses runs
 
 
-def remove_html_tags(text: str) -> str:
-    clean_text = BeautifulSoup(text, "html.parser").text
-    return clean_text
+def remove_non_content_html_tags(text: str) -> str:
+    """Strip HTML tags from a string, keeping each <img>'s src as content; images without a src are dropped."""
+    soup = BeautifulSoup(text, "html.parser")
+
+    for img in soup.find_all("img"):
+        if img.get("src"):
+            img.replace_with(img["src"])
+        else:
+            img.decompose()  # Tag has no delete(); decompose() removes the tag
+
+    return soup.text
 
 
 def remove_punctuation(text: str) -> str:
@@ -24,20 +32,33 @@ def remove_punctuation(text: str) -> str:
     return cleaned
 
 
+def remove_markdown_links(text: str) -> str:
+    """Replace markdown links with their link text, e.g. '[link text](link url)' -> 'link text'. Links with empty text or url are left unchanged."""
+    return re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)
+
+
 def decode_unicode(text: str) -> str:
     """Standardise accents in a string."""
     return unidecode.unidecode(text)
 
 
+def remove_list_markup(text: str) -> str:
+    """Remove markdown list markup, including '1. Item', '*' and '-'."""
+    without_list_numbers = re.sub(r"^\d+\. ", "", text, flags=re.MULTILINE)
+    return without_list_numbers.replace("* ", "").replace("- ", "")
+
+
 def clean_str(input_str: str) -> str:
     """Clean string before hashing, so changes to spacing, punctuation, newlines etc. do not affect the hash."""
     cleaned = input_str
 
     for cleaner in [
-        remove_html_tags,
+        remove_non_content_html_tags,
+        remove_markdown_links,
+        remove_list_markup,
         remove_punctuation,
-        remove_spaces,
         decode_unicode,
+        replace_whitespace,
     ]:
         cleaned = cleaner(cleaned)
 
diff --git a/memium/utils/test_hash_cleaned_str.py b/memium/utils/test_hash_cleaned_str.py
index 77f3f20..d768249 100644
--- a/memium/utils/test_hash_cleaned_str.py
+++ b/memium/utils/test_hash_cleaned_str.py
@@ -36,7 +36,16 @@ def test_hash_cleaned_str_should_remove_html_tags(
 
 
 @pytest.mark.parametrize(
-    ("input_str", "expected"), [("Is <2, but >4", "is2but4")]
+    ("input_str", "expected"),
+    [
+        ("Is <2, but >4", "is_2_but_4"),
+        ("Q. ![](src). Question?", "q_src_question"),
+        ("pre <img src='testsrc' /> post", "pre_testsrc_post"),
+        ("\t", ""),
+        ("\n", ""),
+        ("""1. One\n2. Two""", "one_two"),
+        ("[link1](blah) and [link2](blah)", "link1_and_link2"),
+    ],
 )
 def test_str_cleaner(input_str: str, expected: str):
-    assert clean_str(input_str) == clean_str(expected)
+    assert clean_str(input_str) == expected

From f53455c8a707eb0f5b2a085d734e418bdac0d967 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Wed, 3 Jan 2024 15:45:46 +0000
Subject: [PATCH 2/3] test: update expected scheduling UIDs for new string cleaning

---
 memium/destination/ankiconnect/test_anki_converter.py | 2 +-
 memium/source/extractors/test_prompt_extractor_qa.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/memium/destination/ankiconnect/test_anki_converter.py b/memium/destination/ankiconnect/test_anki_converter.py
index 33bf9c2..f72e19c 100644
--- a/memium/destination/ankiconnect/test_anki_converter.py
+++ b/memium/destination/ankiconnect/test_anki_converter.py
@@ -18,7 +18,7 @@
                 add_tags=["FakeTag"],
             ),
             FakeAnkiQA(
-                uuid=6947886967
+                uuid=4875918425
             ),  # Ensure that UUID generation remains stable to retain idempotency over time
         ),
         (
diff --git a/memium/source/extractors/test_prompt_extractor_qa.py b/memium/source/extractors/test_prompt_extractor_qa.py
index 0a8c557..864f9bb 100644
--- a/memium/source/extractors/test_prompt_extractor_qa.py
+++ b/memium/source/extractors/test_prompt_extractor_qa.py
@@ -26,4 +26,4 @@ def test_qa_prompt_extractor(tmpdir: Path):
     assert prompt.question == "What is the meaning of life?"
     assert prompt.answer == "42"
     assert prompt.tags == ["anki/tag/test_tag"]
-    assert prompt.scheduling_uid == 9385242780
+    assert prompt.scheduling_uid == 3643087944

From a112e1187f5898f8d55d86a279a27c770f3561fe Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Wed, 3 Jan 2024 15:58:47 +0000
Subject: [PATCH 3/3] fix: only remove list markup if at beginning of line

---
 memium/utils/hash_cleaned_str.py      | 8 ++++++--
 memium/utils/test_hash_cleaned_str.py | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/memium/utils/hash_cleaned_str.py b/memium/utils/hash_cleaned_str.py
index 2927f44..4351645 100644
--- a/memium/utils/hash_cleaned_str.py
+++ b/memium/utils/hash_cleaned_str.py
@@ -44,8 +44,12 @@ def decode_unicode(text: str) -> str:
 
 def remove_list_markup(text: str) -> str:
     """Remove markdown list markup, including '1. Item', '*' and '-'."""
-    without_list_numbers = re.sub(r"^\d+\. ", "", text, flags=re.MULTILINE)
-    return without_list_numbers.replace("* ", "").replace("- ", "")
+    # `\s*`, not `\s`: strip markers at column 0 and at any indentation depth.
+    without_list_numbers = re.sub(r"^\s*\d+\. ", "", text, flags=re.MULTILINE)
+    without_bullets = re.sub(
+        r"^\s*\* ", "", without_list_numbers, flags=re.MULTILINE
+    )
+    return re.sub(r"^\s*- ", "", without_bullets, flags=re.MULTILINE)
 
 
 def clean_str(input_str: str) -> str:
diff --git a/memium/utils/test_hash_cleaned_str.py b/memium/utils/test_hash_cleaned_str.py
index d768249..fc0c860 100644
--- a/memium/utils/test_hash_cleaned_str.py
+++ b/memium/utils/test_hash_cleaned_str.py
@@ -43,7 +43,9 @@ def test_hash_cleaned_str_should_remove_html_tags(
         ("pre <img src='testsrc' /> post", "pre_testsrc_post"),
         ("\t", ""),
         ("\n", ""),
-        ("""1. One\n2. Two""", "one_two"),
+        ("""1. One\n\t2. Two""", "one_two"),
+        ("""* One\n\t* Two""", "one_two"),
+        ("""- One\n\t- Two""", "one_two"),
         ("[link1](blah) and [link2](blah)", "link1_and_link2"),
     ],
 )