diff --git a/personal_mnemonic_medium/source/document_ingester.py b/personal_mnemonic_medium/source/document_ingester.py index 4492701a..ca6489a6 100644 --- a/personal_mnemonic_medium/source/document_ingester.py +++ b/personal_mnemonic_medium/source/document_ingester.py @@ -1,4 +1,6 @@ +import logging from collections.abc import Sequence +from dataclasses import dataclass from pathlib import Path from typing import Protocol @@ -6,6 +8,14 @@ from .document import Document +log = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class FileNotRetrievedError(Exception): + path: Path + error: Exception + class BaseDocumentIngester(Protocol): def get_documents(self) -> Sequence[Document]: @@ -16,19 +26,28 @@ class MarkdownDocumentIngester(BaseDocumentIngester): def __init__(self, directory: Path) -> None: self.directory = directory - def get_note_from_file(self, file_path: Path) -> Document: - with file_path.open(encoding="utf8") as f: - file_contents = f.read() - - return Document(content=file_contents, source_path=file_path) + def _get_note_from_file( + self, file_path: Path + ) -> Document | FileNotRetrievedError: + try: + with file_path.open("r", encoding="utf8") as f: + return Document(content=f.read(), source_path=file_path) + except Exception as e: + log.warning(f"Could not retrieve {file_path}: {e}") + return FileNotRetrievedError(file_path, e) def get_documents(self) -> Sequence[Document]: md_files = list(self.directory.rglob("*.md")) - notes: list[Document] = [] + notes: list[Document | FileNotRetrievedError] = [] + with tqdm(total=len(md_files)) as pbar: for filepath in md_files: - notes.append(self.get_note_from_file(filepath)) + notes.append(self._get_note_from_file(filepath)) pbar.update(1) - return notes + return [ + note + for note in notes + if not isinstance(note, FileNotRetrievedError) + ] diff --git a/personal_mnemonic_medium/source/test_document_ingester.py b/personal_mnemonic_medium/source/test_document_ingester.py new file mode 100644 index 00000000..c67447de --- /dev/null +++ b/personal_mnemonic_medium/source/test_document_ingester.py @@ -0,0 +1,45 @@ +import logging +from pathlib import Path + +import pytest + +from .document_ingester import MarkdownDocumentIngester + + +class TestMarkdownIngester: + def test_happy_path(self, tmp_path: Path): + with (tmp_path / "test.md").open("w") as f: + f.write( + """# Hello World\n + #anki/tag/test_tag #anki/tag/test_tag2 + """ + ) + + documents = MarkdownDocumentIngester(directory=tmp_path).get_documents() + + assert len(documents) == 1 + document = documents[0] + assert document.title == "test" + assert document.tags == [ + "anki/tag/test_tag", + "anki/tag/test_tag2", + "comment_tag", + ] + + def test_should_log_error_if_file_not_retrieved( + self, tmp_path: Path, caplog: pytest.LogCaptureFixture + ): + caplog.set_level(logging.WARNING) + + (tmp_path / "succesful.md").write_text("""# I should succeed""") + + failed_path = tmp_path / "failed.md" + failed_path.write_text("""# I should fail""") + # Change permission to write only + # Removing read and execute permissions for owner, group and others + failed_path.chmod(0o200) + + MarkdownDocumentIngester(directory=tmp_path)._get_note_from_file( # type: ignore[PrivateMethodUsage] + Path("I do not exist") + ) + assert "I do not exist" in caplog.records[0].message diff --git a/personal_mnemonic_medium/source/test_ingester_markdown.py b/personal_mnemonic_medium/source/test_ingester_markdown.py deleted file mode 100644 index 1747312c..00000000 --- a/personal_mnemonic_medium/source/test_ingester_markdown.py +++ /dev/null @@ -1,24 +0,0 @@ -from pathlib import Path - -from .document_ingester import MarkdownDocumentIngester - - -def test_markdown_document_ingester(tmpdir: Path): - ingester = MarkdownDocumentIngester(directory=Path(tmpdir)) - - with (Path(tmpdir) / "test.md").open("w") as f: - f.write( - """# Hello World\n -#anki/tag/test_tag #anki/tag/test_tag2 -""" - ) - - documents = ingester.get_documents() - assert len(documents) == 1 - document = documents[0] - assert document.title == "test" - assert document.tags == [ - "anki/tag/test_tag", - "anki/tag/test_tag2", - "comment_tag", - ]