Skip to content

Commit

Permalink
fix: improve content-extraction for scheduling uuid detection (#520)
Browse files Browse the repository at this point in the history
fix: improve content-extraction for scheduling uuid detection

Fixes #519

misc.
  • Loading branch information
MartinBernstorff authored Jan 3, 2024
2 parents 234ad32 + a112e11 commit b0ac0ce
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 13 deletions.
1 change: 1 addition & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"justMyCode": true,
"args": [
"--input-dir=/Users/Shared/life-lessons/docs/life-lessons",
"--max-deletions-per-run=0",
]
}
]
Expand Down
2 changes: 1 addition & 1 deletion memium/destination/ankiconnect/test_anki_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
add_tags=["FakeTag"],
),
FakeAnkiQA(
uuid=6947886967
uuid=4875918425
), # Ensure that UUID generation remains stable to retain idempotency over time
),
(
Expand Down
2 changes: 1 addition & 1 deletion memium/source/extractors/test_prompt_extractor_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ def test_qa_prompt_extractor(tmpdir: Path):
assert prompt.question == "What is the meaning of life?"
assert prompt.answer == "42"
assert prompt.tags == ["anki/tag/test_tag"]
assert prompt.scheduling_uid == 9385242780
assert prompt.scheduling_uid == 3643087944
2 changes: 1 addition & 1 deletion memium/source/prompts/prompt_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class QAPrompt(BasePrompt):
@property
def scheduling_uid_str(self) -> str:
"""Str used for generating the update_uid. Super helpful for debugging."""
return clean_str(f"{self.question}_{self.answer}")
return f"{clean_str(self.question)}_{clean_str(self.answer)}"

@property
def scheduling_uid(self) -> int:
Expand Down
41 changes: 33 additions & 8 deletions memium/utils/hash_cleaned_str.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,25 @@
import hashlib
import re

import unidecode
from bs4 import BeautifulSoup


def remove_spaces(text: str) -> str:
    """Return ``text`` with every space character (' ') dropped.

    Only the literal space is affected; tabs and newlines are kept.
    """
    pieces = text.split(" ")
    return "".join(pieces)
def replace_whitespace(text: str) -> str:
    """Join the whitespace-separated words of ``text`` with underscores.

    Each run of whitespace (spaces, tabs, newlines) becomes a single '_';
    leading and trailing whitespace is dropped, so whitespace-only input
    yields an empty string.
    """
    words = text.split()
    return "_".join(words)


def remove_html_tags(text: str) -> str:
    """Parse ``text`` as HTML and return only its text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
def remove_non_content_html_tags(text: str) -> str:
    """Strip HTML markup from a string while keeping content-bearing values.

    An ``<img>`` tag carries its content in the ``src`` attribute rather than
    in a text node, so each image is replaced by its ``src`` value before the
    remaining markup is dropped. Images without a non-empty ``src`` are
    removed entirely.

    Args:
        text: String that may contain HTML.

    Returns:
        The text content of the parsed document, with image sources inlined.
    """
    soup = BeautifulSoup(text, "html.parser")

    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            img.replace_with(src)
        else:
            # Tag has no ``delete`` method: ``img.delete`` is resolved as a
            # child-tag lookup that returns None, so calling it raised
            # TypeError. ``decompose`` is the supported way to drop a tag.
            img.decompose()

    return soup.text


def remove_punctuation(text: str) -> str:
Expand All @@ -24,20 +32,37 @@ def remove_punctuation(text: str) -> str:
return cleaned


def remove_markdown_links(text: str) -> str:
    """Replace every markdown link with its label, e.g. '[label](url)' -> 'label'.

    Links with an empty label or an empty target are left untouched.
    """
    link_pattern = re.compile(r"\[([^\]]+)\]\([^\)]+\)")
    return link_pattern.sub(r"\1", text)


def decode_unicode(text: str) -> str:
    """Standardise accented characters by passing ``text`` through unidecode."""
    transliterated = unidecode.unidecode(text)
    return transliterated


def remove_list_markup(text: str) -> str:
    """Remove markdown list markers from the start of each line.

    Handles numbered items ('1. Item') as well as '*' and '-' bullets, at any
    indentation level (including none). Markers that are not at the start of
    a line are left alone.

    Args:
        text: Possibly multi-line markdown text.

    Returns:
        ``text`` with leading list markers (and the indentation before them)
        stripped.
    """
    # All three patterns allow *any* amount of leading whitespace. The bullet
    # and dash patterns previously required exactly one whitespace character,
    # so unindented ('* One') or deeply indented bullets were missed.
    marker_patterns = (
        r"^\s*\d+\. ",
        r"^\s*\* ",
        r"^\s*- ",
    )

    cleaned = text
    # Applied sequentially (numbers, then '*', then '-') to keep the original
    # processing order.
    for pattern in marker_patterns:
        cleaned = re.sub(pattern, "", cleaned, flags=re.MULTILINE)
    return cleaned


def clean_str(input_str: str) -> str:
"""Clean string before hashing, so changes to spacing, punctuation, newlines etc. do not affect the hash."""
cleaned = input_str

for cleaner in [
remove_html_tags,
remove_non_content_html_tags,
remove_markdown_links,
remove_list_markup,
remove_punctuation,
remove_spaces,
decode_unicode,
replace_whitespace,
]:
cleaned = cleaner(cleaned)

Expand Down
15 changes: 13 additions & 2 deletions memium/utils/test_hash_cleaned_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,18 @@ def test_hash_cleaned_str_should_remove_html_tags(


@pytest.mark.parametrize(
("input_str", "expected"), [("Is <2, but >4", "is2but4")]
("input_str", "expected"),
[
("Is <2, but >4", "is_2_but_4"),
("Q. ![](src). Question?", "q_src_question"),
("pre <img src='testsrc' /> post", "pre_testsrc_post"),
("\t", ""),
("\n", ""),
("""1. One\n\t2. Two""", "one_two"),
("""* One\n\t* Two""", "one_two"),
("""- One\n\t- Two""", "one_two"),
("[link1](blah) and [link2](blah)", "link1_and_link2"),
],
)
def test_str_cleaner(input_str: str, expected: str):
assert clean_str(input_str) == clean_str(expected)
assert clean_str(input_str) == expected

0 comments on commit b0ac0ce

Please sign in to comment.