implementing the UNIMPLEMENTED_PARSERS #97

Merged
Merged 40 commits on Sep 13, 2023
Changes from 19 commits
Commits
40 commits
c8f77aa
to start the pr to add comments
Thomas-Lemoine Jul 19, 2023
90517c4
removed spaces
Thomas-Lemoine Jul 19, 2023
3617c9e
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Jul 19, 2023
049fb56
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Jul 23, 2023
6848dee
create logger_config and reorder the imports
Thomas-Lemoine Jul 28, 2023
682e96e
main's logger
Thomas-Lemoine Jul 28, 2023
3b38600
ignore the log files
Thomas-Lemoine Jul 28, 2023
367f5df
postprocess notes
Thomas-Lemoine Jul 28, 2023
932561a
fix test with new download order for pdfarticles
Thomas-Lemoine Jul 28, 2023
728b124
Handle special docs
mruwnik Aug 4, 2023
b9999b4
Fetch new items from indices
mruwnik Aug 4, 2023
7ee7f9a
fixed domain getter from network location
Thomas-Lemoine Aug 4, 2023
042fc67
logger and minor fixes
Thomas-Lemoine Aug 4, 2023
f6b0afc
comment: add www2. and www6. handling
Thomas-Lemoine Aug 4, 2023
3381f1b
Merge branch 'special_docs' into special_docs_with_parsers
Thomas-Lemoine Aug 6, 2023
e85b04c
removed logger_config
Thomas-Lemoine Aug 6, 2023
cad2749
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Aug 16, 2023
43905ef
merge with main and minor changes
Thomas-Lemoine Aug 16, 2023
720ec97
Merge remote-tracking branch 'origin/implement_more_parsers' into imp…
Thomas-Lemoine Aug 16, 2023
654d76a
rm logger_config.py
Thomas-Lemoine Aug 16, 2023
e41ad00
minor fixes
Thomas-Lemoine Aug 16, 2023
40cc96c
minor fixes 2
Thomas-Lemoine Aug 16, 2023
d36687d
parsers type signature
Thomas-Lemoine Aug 17, 2023
34ccba9
test_arxiv_process_entry_retracted fixed
Thomas-Lemoine Aug 17, 2023
a5115cd
Refactor of special_indices
Thomas-Lemoine Aug 17, 2023
f2a3b96
1239283019481293043902
Thomas-Lemoine Aug 17, 2023
3cff71b
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Aug 21, 2023
7c5c4ab
Merge branch 'special_indices_refactor' into implement_more_parsers
Thomas-Lemoine Aug 21, 2023
00b70be
alignmentdataset class removed some init fields
Thomas-Lemoine Aug 21, 2023
6ef15f3
removed the wrong arxivpapers file
Thomas-Lemoine Aug 21, 2023
ad89b44
minor changes
Thomas-Lemoine Aug 21, 2023
70a9757
Merge branch 'special_docs_with_parsers' into implement_more_parsers
Thomas-Lemoine Aug 21, 2023
cf0bdf4
Merge branch 'main' into implement_more_parsers
Thomas-Lemoine Aug 27, 2023
8add5de
pdf date_published is a datetime
Thomas-Lemoine Aug 27, 2023
057015b
revert some useless changes
Thomas-Lemoine Aug 31, 2023
789a9c8
revert type annotation change
Thomas-Lemoine Aug 31, 2023
662db51
Merge remote-tracking branch 'origin/main' into implement_more_parsers
Thomas-Lemoine Sep 8, 2023
e58292f
nits
henri123lemoine Sep 8, 2023
15efdb8
nits 2
henri123lemoine Sep 9, 2023
f05c4a9
nits 2
henri123lemoine Sep 9, 2023
4 changes: 3 additions & 1 deletion .gitignore
@@ -123,4 +123,6 @@ carado.moe/
*.epub

credentials.json
data/raw/
data/raw/

*.log
3 changes: 2 additions & 1 deletion align_data/analysis/analyse_jsonl_data.py
@@ -1,8 +1,9 @@
from datetime import datetime
from pathlib import Path
from collections import defaultdict

import jsonlines

from collections import defaultdict


def is_valid_date_format(data_dict, format="%Y-%m-%dT%H:%M:%SZ"):
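The hunk above only shows the import reordering; the body of is_valid_date_format stays collapsed. As a rough sketch of what such a validator generally does (an assumption, not code from this repository), a date string can be checked against the "%Y-%m-%dT%H:%M:%SZ" format with a strptime round-trip:

from datetime import datetime

# Hypothetical helper: the real is_valid_date_format takes a data_dict, but the core
# check is presumably a strptime attempt like this one.
def is_valid_date_string(value: str, format: str = "%Y-%m-%dT%H:%M:%SZ") -> bool:
    try:
        datetime.strptime(value, format)
        return True
    except (ValueError, TypeError):
        # ValueError: string does not match the format; TypeError: value is not a string
        return False

# is_valid_date_string("2023-09-13T00:00:00Z")  -> True
# is_valid_date_string("Sep 13, 2023")          -> False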
2 changes: 2 additions & 0 deletions align_data/analysis/count_tokens.py
@@ -1,3 +1,5 @@
from typing import Tuple

from transformers import AutoTokenizer
import jsonlines
import logging
113 changes: 53 additions & 60 deletions align_data/common/alignment_dataset.py
@@ -4,30 +4,20 @@
import time
from dataclasses import dataclass, field, KW_ONLY
from pathlib import Path
from typing import Iterable, List, Optional, Set
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload
from typing import List, Optional, Dict, Any, Set, Iterable, Tuple
import pytz
from datetime import datetime

from sqlalchemy import select, Select, JSON
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload, Session
import jsonlines
import pytz
from dateutil.parser import parse, ParserError
from tqdm import tqdm

from align_data.db.models import Article, Summary
from align_data.db.session import make_session

INIT_DICT = {
"source": None,
"id": None,
"text": None,
"date_published": None,
"title": None,
"url": None,
"authors": lambda: [],
"source_type": None,
"status": None,
"comments": None,
}
from align_data.settings import ARTICLE_MAIN_KEYS

logger = logging.getLogger(__name__)

@@ -41,7 +31,16 @@ class AlignmentDataset:

_: KW_ONLY

files_path = Path("")
id_fields: List[str] = field(default_factory=lambda: ["url", "title"])
"""A list of fields to use as the id of the entry. If not set, will use ['url', 'title']"""

# Internal housekeeping variables
_outputted_items: Set[str] = field(default_factory=set)
"""A set of the ids of all previously processed items"""

data_path: Path = field(init=False)
raw_data_path: Path = field(init=False)
files_path: Path = field(init=False)
"""The path where data can be found. Usually a folder"""

done_key = "id"
@@ -55,23 +54,12 @@ class AlignmentDataset:
batch_size = 20
"""The number of items to collect before flushing to the database."""

# Internal housekeeping variables
_entry_idx = 0
"""Used internally for writing debugging info - each file write will increment it"""
_outputted_items = set()
"""A set of the ids of all previously processed items"""
_: KW_ONLY
id_fields: List[str] = field(default_factory=lambda: ["url", "title"])
"""A list of fields to use as the id of the entry. If not set, will use ['url', 'title']"""

def __str__(self) -> str:
return self.name

def __post_init__(self, data_path=Path(__file__).parent / "../../data/"):
self.data_path = data_path
def __post_init__(self, data_path: Optional[Path] = None):
self.data_path = data_path or (Path(__file__).parent / "../../data/").resolve()
self.raw_data_path = self.data_path / "raw"

# set the default place to look for data
self.files_path = self.raw_data_path / self.name

def _add_authors(self, article: Article, authors: List[str]) -> Article:
@@ -81,57 +69,57 @@ def _add_authors(self, article: Article, authors: List[str]) -> Article:
article.authors = ",".join(article.authors[:1024].split(",")[:-1])
return article

def make_data_entry(self, data, **kwargs) -> Article:
def make_data_entry(self, data: Dict[str, Any], **kwargs) -> Article:
data = dict(data, **kwargs)
summary = data.pop("summary", None)
authors = data.pop("authors", [])

article = Article(
id_fields=self.id_fields,
meta={k: v for k, v in data.items() if k not in INIT_DICT and v is not None},
**{k: v for k, v in data.items() if k in INIT_DICT},
meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
**{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
)
self._add_authors(article, authors)
if summary:
article.summaries.append(Summary(text=summary, source=self.name))
return article

def to_jsonl(self, out_path=None, filename=None) -> Path:
if not out_path:
out_path = Path(__file__).parent / "../../data/"

if not filename:
filename = f"{self.name}.jsonl"
filename = Path(out_path) / filename
def to_jsonl(self, out_path: Path | None = None, filename: str | None = None) -> Path:
out_path = out_path or self.data_path
filename = filename or f"{self.name}.jsonl"
filepath = out_path / filename

with jsonlines.open(filename, "w") as jsonl_writer:
with jsonlines.open(filepath, "w") as jsonl_writer:
for article in self.read_entries():
jsonl_writer.write(article.to_dict())
return filename.resolve()
return filepath.resolve()

@property
def _query_items(self):
def _query_items(self) -> Select[Tuple[Article]]:
return select(Article).where(Article.source == self.name)

def read_entries(self, sort_by=None):
def read_entries(self, sort_by=None) -> Iterable[Article]:
"""Iterate through all the saved entries."""
with make_session() as session:
query = self._query_items.options(joinedload(Article.summaries))
if sort_by is not None:
query = query.order_by(sort_by)
for item in session.scalars(query).unique():
yield item

result = session.scalars(query)
for article in result.unique(): # removes duplicates
yield article

def _add_batch(self, session, batch):
def _add_batch(self, session: Session, batch):
session.add_all(batch)

def add_entries(self, entries):
def commit():
def commit() -> bool:
try:
session.commit()
return True
except IntegrityError:
session.rollback()
return False

with make_session() as session:
items = iter(entries)
@@ -161,7 +149,11 @@ def get_item_key(self, item):
return item.name

def _load_outputted_items(self) -> Set[str]:
"""Load the output file (if it exists) in order to know which items have already been output."""
"""
Loads the outputted items from the database and returns them as a set.

if the done_key is not an attribute of Article, it will try to load it from the meta field.
"""
with make_session() as session:
if hasattr(Article, self.done_key):
# This doesn't filter by self.name. The good thing about that is that it should handle a lot more
@@ -170,10 +162,10 @@ def _load_outputted_items(self) -> Set[str]:
return set(
session.scalars(select(getattr(Article, self.done_key))).all()
)
# TODO: Properly handle this - it should create a proper SQL JSON select
return {
item.get(self.done_key)
for item in session.scalars(select(Article.meta)).all()
meta[self.done_key]
for meta in session.scalars(select(Article.meta)).all()
if isinstance(meta, JSON) and meta.get(self.done_key)
}

def unprocessed_items(self, items=None) -> Iterable:
@@ -183,6 +175,7 @@ def unprocessed_items(self, items=None) -> Iterable:
based on the contents of the output file.
"""
self.setup()
items = items or self.items_list

def not_processed(item):
# NOTE: `self._outputted_items` reads in all items. Which could potentially be a lot. If this starts to
@@ -191,15 +184,15 @@ def not_processed(item):
# If it get's to that level, consider batching it somehow
return self.get_item_key(item) not in self._outputted_items

filtered = filter(not_processed, items or self.items_list)
items_to_process = filter(not_processed, items)

# greedily fetch all items if not lazy eval. This makes the progress bar look nice
if not self.lazy_eval:
filtered = list(filtered)
items_to_process = list(items_to_process)

return filtered
return items_to_process

def fetch_entries(self):
def fetch_entries(self) -> Article:
"""Get all entries to be written to the file."""
for item in tqdm(self.unprocessed_items(), desc=f"Processing {self.name}"):
entry = self.process_entry(item)
@@ -216,11 +209,11 @@ def process_entry(self, entry) -> Optional[Article]:
raise NotImplementedError

@staticmethod
def _format_datetime(date) -> str:
def _format_datetime(date: datetime) -> str:
return date.strftime("%Y-%m-%dT%H:%M:%SZ")

@staticmethod
def _get_published_date(date) -> Optional[datetime]:
def _get_published_date(date: str) -> Optional[datetime]:
try:
# Totally ignore any timezone info, forcing everything to UTC
return parse(str(date)).replace(tzinfo=pytz.UTC)
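One of the larger changes in alignment_dataset.py is dropping the local INIT_DICT in favour of ARTICLE_MAIN_KEYS imported from align_data.settings, which make_data_entry uses to split incoming data between Article columns and the catch-all JSON meta field. A minimal sketch of that split, with a key list assumed to mirror the removed INIT_DICT:

from typing import Any, Dict, Tuple

# Assumed to mirror the keys of the removed INIT_DICT; the real list lives in
# align_data.settings as ARTICLE_MAIN_KEYS.
ARTICLE_MAIN_KEYS = [
    "source", "id", "text", "date_published", "title",
    "url", "authors", "source_type", "status", "comments",
]

def split_article_fields(data: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    # Known Article columns become constructor kwargs...
    main = {k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS}
    # ...anything else with a non-null value is stashed in the JSON meta field.
    meta = {k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None}
    return main, meta

main, meta = split_article_fields(
    {"title": "Example", "url": "https://example.org", "votes": 12, "empty": None}
)
# main == {"title": "Example", "url": "https://example.org"}
# meta == {"votes": 12}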
34 changes: 16 additions & 18 deletions align_data/common/html_dataset.py
@@ -1,23 +1,22 @@
import pytz
import regex as re
import logging
from datetime import datetime
from dateutil.parser import parse
from dataclasses import dataclass, field, KW_ONLY
from dataclasses import dataclass, field
from urllib.parse import urljoin
from typing import List
import re

import requests
import feedparser
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
from markdownify import markdownify

from align_data.common.alignment_dataset import AlignmentDataset

logger = logging.getLogger(__name__)


@dataclass
@dataclass()
class HTMLDataset(AlignmentDataset):
"""
Fetches articles from a different blog by collecting links to articles from an index page.
@@ -27,36 +26,35 @@ class HTMLDataset(AlignmentDataset):
done_key = "url"

authors: List[str] = field(default_factory=list)
_: KW_ONLY
source_key: str = None
summary_key: str = None

item_selector = "article"
title_selector = "article h1"
text_selector = "article"
source_type = "blog"
ignored_selectors = []

def extract_authors(self, article):
def extract_authors(self, article): #TODO: make this work
return self.authors

def get_item_key(self, item):
article_url = item.find_all("a")[0]["href"].split("?")[0]
return urljoin(self.url, article_url)

def get_item_key(self, item: Tag) -> str:
first_href = item.find("a")["href"]
href_base, *_ = first_href.split("?")
return urljoin(self.url, href_base)

@property
def items_list(self):
def items_list(self) -> ResultSet[Tag]:
logger.info(f"Fetching entries from {self.url}")
response = requests.get(self.url, allow_redirects=True)
soup = BeautifulSoup(response.content, "html.parser")
articles = soup.select(self.item_selector)
logger.info(f"Found {len(articles)} articles")
return articles

def _extra_values(self, contents):
def _extra_values(self, contents: BeautifulSoup):
return {}

def process_entry(self, article):
def process_entry(self, article: Tag):
article_url = self.get_item_key(article)
contents = self._get_contents(article_url)

@@ -79,7 +77,7 @@ def process_entry(self, article):
}
)

def _get_contents(self, url):
def _get_contents(self, url: str):
logger.info("Fetching {}".format(url))
resp = requests.get(url, allow_redirects=True)
return BeautifulSoup(resp.content, "html.parser")
Expand All @@ -97,7 +95,7 @@ def _get_text(self, contents):

def _find_date(self, items):
for i in items:
if re.match("\w+ \d{1,2}, \d{4}", i.text):
if re.match(r"\w+ \d{1,2}, \d{4}", i.text):
return datetime.strptime(i.text, "%b %d, %Y").replace(tzinfo=pytz.UTC)

def _extract_markdown(self, element):
@@ -132,7 +130,7 @@ def _get_text(self, item):
text = item.get("content") and item["content"][0].get("value")
return self._extract_markdown(text)

def _get_contents(self, url):
def _get_contents(self, url: str):
item = self.items[url]
if "content" in item:
return item
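HTMLDataset is configured almost entirely through class attributes (url, item_selector, title_selector, text_selector, ignored_selectors), so concrete scrapers are thin subclasses. A hypothetical example of what such a subclass might look like; the class name, URL and selectors are illustrative and not taken from this PR:

from dataclasses import dataclass

from align_data.common.html_dataset import HTMLDataset

@dataclass
class ExampleBlog(HTMLDataset):
    # All selector values below are made up for illustration.
    item_selector = "div.post-preview"         # overrides the default "article"
    title_selector = "h2.post-title"
    text_selector = "div.post-body"
    ignored_selectors = ["div.share-buttons"]  # stripped before markdown conversion

# Typical use (assuming the usual name/url constructor fields):
# dataset = ExampleBlog(name="example_blog", url="https://blog.example.org")
# for article in dataset.fetch_entries():
#     ...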
Empty file removed align_data/common/utils.py
Empty file.