Skip to content

Commit

Permalink
feat: change document ingestion to best-effort (#416)
Browse files Browse the repository at this point in the history
feat: change document ingestion to best-effort

feat: implement
  • Loading branch information
MartinBernstorff authored Dec 23, 2023
2 parents 0acd631 + bb00c4c commit bbe1196
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 32 deletions.
35 changes: 27 additions & 8 deletions personal_mnemonic_medium/source/document_ingester.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
import logging
from collections.abc import Sequence
from dataclasses import dataclass
from pathlib import Path
from typing import Protocol

from tqdm import tqdm

from .document import Document

log = logging.getLogger(__name__)


@dataclass(frozen=True)
class FileNotRetrievedError(Exception):
    """Error value describing a file that could not be read.

    Pairs the offending path with the exception that was caught while
    reading it. ``_get_note_from_file`` returns instances of this class
    instead of raising them.
    """

    # Path of the file that could not be retrieved.
    path: Path
    # Exception caught while trying to read the file.
    error: Exception


class BaseDocumentIngester(Protocol):
def get_documents(self) -> Sequence[Document]:
Expand All @@ -16,19 +26,28 @@ class MarkdownDocumentIngester(BaseDocumentIngester):
def __init__(self, directory: Path) -> None:
    """Remember the directory that documents are collected from.

    Args:
        directory: Root directory; stored as ``self.directory`` and later
            searched recursively for ``*.md`` files by ``get_documents``.
    """
    self.directory = directory

def get_note_from_file(self, file_path: Path) -> Document:
    """Read a file as UTF-8 text and wrap it in a ``Document``.

    Args:
        file_path: File to read.

    Returns:
        A ``Document`` whose content is the file's text and whose
        ``source_path`` is ``file_path``.

    Nothing is caught here: any error raised while opening or decoding the
    file propagates to the caller.
    """
    text = file_path.read_text(encoding="utf8")
    return Document(content=text, source_path=file_path)
def _get_note_from_file(
    self, file_path: Path
) -> Document | FileNotRetrievedError:
    """Read one file as UTF-8, returning an error value instead of raising.

    Args:
        file_path: File to read.

    Returns:
        A ``Document`` holding the file's text and its path on success,
        otherwise a ``FileNotRetrievedError`` wrapping the path and the
        exception that was caught.
    """
    try:
        with file_path.open("r", encoding="utf8") as f:
            return Document(content=f.read(), source_path=file_path)
    except Exception as e:
        # Deliberately broad: ingestion is best-effort, so any failure for a
        # single file is logged and handed back as a value rather than
        # aborting the caller.
        # Lazy %-style arguments let the logging framework skip formatting
        # when WARNING is disabled; the rendered message is unchanged.
        log.warning("Could not retrieve %s: %s", file_path, e)
        return FileNotRetrievedError(file_path, e)

def get_documents(self) -> Sequence[Document]:
    """Collect every readable ``*.md`` file under ``self.directory``.

    The directory is searched recursively. Files are read on a best-effort
    basis: a file that cannot be retrieved is left out of the result (the
    helper logs a warning for it) instead of failing the whole run.

    Returns:
        The documents that were read successfully, in the order the files
        were found.
    """
    md_files = list(self.directory.rglob("*.md"))

    documents: list[Document] = []

    # The file list is materialised first so the progress bar has a total.
    with tqdm(total=len(md_files)) as pbar:
        for filepath in md_files:
            note = self._get_note_from_file(filepath)
            # Each file is read exactly once; error values are dropped here
            # so that only real documents reach the caller.
            if not isinstance(note, FileNotRetrievedError):
                documents.append(note)
            pbar.update(1)

    return documents
45 changes: 45 additions & 0 deletions personal_mnemonic_medium/source/test_document_ingester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import logging
from pathlib import Path

import pytest

from .document_ingester import MarkdownDocumentIngester


class TestMarkdownIngester:
    """Tests for ``MarkdownDocumentIngester``."""

    def test_happy_path(self, tmp_path: Path):
        """A single markdown file is ingested with its title and tags."""
        with (tmp_path / "test.md").open("w") as f:
            f.write(
                """# Hello World\n
#anki/tag/test_tag #anki/tag/test_tag2 <!-- #comment_tag -->
"""
            )

        documents = MarkdownDocumentIngester(directory=tmp_path).get_documents()

        assert len(documents) == 1
        document = documents[0]
        assert document.title == "test"
        assert document.tags == [
            "anki/tag/test_tag",
            "anki/tag/test_tag2",
            "comment_tag",
        ]

    def test_should_log_error_if_file_not_retrieved(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ):
        """A path that cannot be read yields an error value and a warning."""
        caplog.set_level(logging.WARNING)

        # A non-existent path fails to open regardless of the user running
        # the tests, so no on-disk fixture or permission change is needed.
        result = MarkdownDocumentIngester(directory=tmp_path)._get_note_from_file(  # type: ignore[PrivateMethodUsage]
            Path("I do not exist")
        )

        # The failure is returned as a value (an Exception subclass), not
        # raised, and the path shows up in the logged warning.
        assert isinstance(result, Exception)
        assert "I do not exist" in caplog.records[0].message
24 changes: 0 additions & 24 deletions personal_mnemonic_medium/source/test_ingester_markdown.py

This file was deleted.

0 comments on commit bbe1196

Please sign in to comment.