From 6a101d40e35a0dd00da42ab77e419ecbabfdcbe4 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Sun, 27 Oct 2024 14:08:18 +0100 Subject: [PATCH] fix: improve logging of parser errors (#745) --- memium/source/extractors/extractor_qa.py | 6 +++++- memium/source/prompt_source.py | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/memium/source/extractors/extractor_qa.py b/memium/source/extractors/extractor_qa.py index a72f9c1..b7155bc 100644 --- a/memium/source/extractors/extractor_qa.py +++ b/memium/source/extractors/extractor_qa.py @@ -53,13 +53,14 @@ def extract_prompts(self, document: Document) -> Sequence[QAPrompt]: blocks = self._string_to_blocks_by_newlines(document.content) block_starting_line_nr = 1 + missing_answers: list[str] = [] for block_string in blocks: if self._has_qa(block_string): question = self._get_first_question(block_string) try: answer = self._get_first_answer(block_string) except IndexError: - logging.warning(f"Could not find answer in {document.title} for {question}") + missing_answers.append(f"{document.title}: {question}") continue prompts.append( @@ -74,4 +75,7 @@ def extract_prompts(self, document: Document) -> Sequence[QAPrompt]: block_lines = len(re.findall(r"\n", block_string, flags=re.DOTALL)) block_starting_line_nr += block_lines + if missing_answers: + logging.warning(f"{missing_answers} is missing an answer") + return prompts diff --git a/memium/source/prompt_source.py b/memium/source/prompt_source.py index ae8febe..4cd2b0d 100644 --- a/memium/source/prompt_source.py +++ b/memium/source/prompt_source.py @@ -40,7 +40,12 @@ def _deduplicate_group(self, group: tuple[str, Sequence[BasePrompt]]) -> BasePro prompts_in_group = group[1] if len(prompts_in_group) != 1: - log.warning(f"Found duplicate prompts for {prompts_in_group[0]}") + identifier = ( + prompts_in_group[0].edit_url + if prompts_in_group[0].edit_url + else prompts_in_group[0] + ) + log.warning(f"Found duplicate prompts for {identifier}. Prompts: {prompts_in_group}") return prompts_in_group[0]