Skip to content

Commit

Permalink
fix: false positives on brackets (#496)
Browse files (browse the repository at this point in the history)
fix: false positives on brackets

Fixes #495

fix: remove entire code blocks
  • Loading branch information
MartinBernstorff authored Dec 27, 2023
2 parents 3ecc92e + 780ec6f commit 5606322
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 13 deletions.
27 changes: 14 additions & 13 deletions memium/source/extractors/extractor_cloze.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,21 @@ class ClozePromptExtractor(BasePromptExtractor):
@staticmethod
def _get_blocks(string: str) -> list[str]:
"""Break string into a list by 2+ newlines in a row."""
return re.split(r"(\n\n)+", string)
# Exclude entire code blocks
string_sans_code_blocks = re.sub(
r"```.*?```", "", string, flags=re.DOTALL
)
return re.split(r"(\n\n)+", string_sans_code_blocks)

@staticmethod
def _has_cloze(string: str) -> bool:
if (
len(re.findall(r"{.*}", string)) > 0
and "BearID" not in string # Exclude BearID
and "$$" not in string # Exclude math
and r"```" not in string # Exclude code
and "Q." not in string # Exclude Q&A
and "A." not in string # Exclude Q&A
):
if len(re.findall(r"{.*}", string)) > 0:
return True
return False

@staticmethod
def _is_code_block(string: str) -> bool:
if string.startswith("```"):
def _is_math_block(string: str) -> bool:
if string.startswith("$$"):
return True
return False

Expand Down Expand Up @@ -75,8 +72,12 @@ def extract_prompts(self, document: Document) -> Sequence[ClozePrompt]:
blocks = self._get_blocks(document.content)

for block_string in blocks:
if self._is_code_block(block_string) or self._is_html_comment(
block_string
if any(
exclusion_criterion(block_string)
for exclusion_criterion in (
self._is_html_comment,
self._is_math_block,
)
):
continue
if self._has_cloze(block_string):
Expand Down
8 changes: 8 additions & 0 deletions memium/source/extractors/test_prompt_extractor_cloze.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ def test_cloze_prompt_extractor(tmpdir: Path):
),
("""<!-- {HTML comment} -->""", True),
("""{Cloze}""", False),
(
"""```html
Some content
Content in another {block}
```""",
True,
),
],
)
def test_ignore_block_types(content: str, skipped: bool):
Expand Down

0 comments on commit 5606322

Please sign in to comment.