From b44afcfeac9b37a561d89cf71045400cc590d354 Mon Sep 17 00:00:00 2001 From: Thomas Lemoine Date: Sun, 10 Sep 2023 13:32:40 -0400 Subject: [PATCH] fix title and url for agentmodels --- align_data/sources/ebooks/agentmodels.py | 63 +++++++++++++++++++----- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/align_data/sources/ebooks/agentmodels.py b/align_data/sources/ebooks/agentmodels.py index 65b52502..3c983dd1 100644 --- a/align_data/sources/ebooks/agentmodels.py +++ b/align_data/sources/ebooks/agentmodels.py @@ -1,8 +1,10 @@ -from align_data.common.alignment_dataset import AlignmentDataset +from pathlib import Path from dataclasses import dataclass -from git import Repo import logging -from datetime import timezone +from datetime import datetime, timezone + +from align_data.common.alignment_dataset import AlignmentDataset +from git import Repo logger = logging.getLogger(__name__) @@ -22,15 +24,52 @@ def setup(self): self.base_dir = self.raw_data_path / "agentmodels.org" if not self.base_dir.exists() or not list(self.base_dir.glob("*")): logger.info("Cloning repo") - Repo.clone_from(self.repo, self.base_dir) + Repo.clone_from(url=self.repo, to_path=self.base_dir) self.repository = Repo(self.base_dir) self.files_path = self.base_dir / "chapters" - def _get_published_date(self, filename): - last_commit = next(self.repository.iter_commits(paths=f"chapters/{filename.name}")) + @property + def items_list(self): + return self.files_path.iterdir() + + def _get_published_date(self, filepath: Path) -> datetime: + last_commit = next(self.repository.iter_commits(paths=f"chapters/{filepath.name}")) return last_commit.committed_datetime.astimezone(timezone.utc) + + def _get_title(self, filepath: Path) -> str | None: + """ + Receives a filepath, and retrieves the title. 
+ Examples: + if filepath.stem: 6-efficient-inference + then title: Modeling Agents with Probabilistic Programs - Chapter 6: Efficient inference + + if filepath.stem: 2-webppl + then title: Modeling Agents with Probabilistic Programs - Chapter 2: Webppl + """ + if filepath.stem[:1].isnumeric(): + chapter_num, chapter_name = filepath.stem.split("-", 1) + chapter_name = chapter_name.replace('-', ' ').capitalize() + return f"Modeling Agents with Probabilistic Programs - Chapter {chapter_num}: {chapter_name}" + chapter_name = filepath.stem.replace('-', ' ').capitalize() + return f"Modeling Agents with Probabilistic Programs - {chapter_name}" + + def _get_url(self, filepath: Path) -> str | None: + """ + Receives a filepath and retrieves the url. + Examples: + if filepath.stem: 6-efficient-inference + then url: https://agentmodels.org/chapters/6-efficient-inference.html + + if filepath.stem: .3d-something + then url: None + """ + if filepath.stem.startswith('.'): + return None # unusual file + #TODO: The website has "hidden" the pages for chapter 6 (filepath.stem.startswith("6")), so the + # link doesn't point to the actual text of this chapter. To fix. + return f"https://agentmodels.org/chapters/{filepath.stem}.html" - def process_entry(self, filename): + def process_entry(self, filepath): return self.make_data_entry( { "source": self.name, @@ -41,10 +80,10 @@ def process_entry(self, filename): "John Salvatier", "Daniel Filan", ], - "date_published": self._get_published_date(filename), - "title": "Modeling Agents with Probabilistic Programs", - "url": f"https://agentmodels.org/chapters/{filename.stem}.html", - "filename": filename.name, - "text": filename.read_text(encoding="utf-8"), + "date_published": self._get_published_date(filepath), + "title": self._get_title(filepath), + "url": self._get_url(filepath), + "filename": filepath.name, + "text": filepath.read_text(encoding="utf-8"), } )