From b44afcfeac9b37a561d89cf71045400cc590d354 Mon Sep 17 00:00:00 2001
From: Thomas Lemoine <lemoine123thomas@gmail.com>
Date: Sun, 10 Sep 2023 13:32:40 -0400
Subject: [PATCH] fix title and url for agentmodels

---
 align_data/sources/ebooks/agentmodels.py | 63 +++++++++++++++++++-----
 1 file changed, 51 insertions(+), 12 deletions(-)

diff --git a/align_data/sources/ebooks/agentmodels.py b/align_data/sources/ebooks/agentmodels.py
index 65b52502..3c983dd1 100644
--- a/align_data/sources/ebooks/agentmodels.py
+++ b/align_data/sources/ebooks/agentmodels.py
@@ -1,8 +1,10 @@
-from align_data.common.alignment_dataset import AlignmentDataset
+from pathlib import Path
 from dataclasses import dataclass
-from git import Repo
 import logging
-from datetime import timezone
+from datetime import datetime, timezone
+
+from align_data.common.alignment_dataset import AlignmentDataset
+from git import Repo
 
 logger = logging.getLogger(__name__)
 
@@ -22,15 +24,52 @@ def setup(self):
         self.base_dir = self.raw_data_path / "agentmodels.org"
         if not self.base_dir.exists() or not list(self.base_dir.glob("*")):
             logger.info("Cloning repo")
-            Repo.clone_from(self.repo, self.base_dir)
+            Repo.clone_from(url=self.repo, to_path=self.base_dir)
         self.repository = Repo(self.base_dir)
         self.files_path = self.base_dir / "chapters"
 
-    def _get_published_date(self, filename):
-        last_commit = next(self.repository.iter_commits(paths=f"chapters/{filename.name}"))
+    @property
+    def items_list(self):
+        return self.files_path.iterdir()
+
+    def _get_published_date(self, filepath: Path) -> datetime:
+        last_commit = next(self.repository.iter_commits(paths=f"chapters/{filepath.name}"))
         return last_commit.committed_datetime.astimezone(timezone.utc)
+    
+    def _get_title(self, filepath: Path) -> str | None:
+        """
+        Receives a filepath, and retrieves the title.
+        Examples:
+            if filepath.stem: 6-efficient-inference
+            then title: Modeling Agents with Probabilistic Programs - Chapter 6: Efficient inference
+
+            if filepath.stem: 2-webppl
+            then title: Modeling Agents with Probabilistic Programs - Chapter 2: Webppl
+        """
+        if filepath.stem[:1].isnumeric():
+            chapter_num, chapter_name = filepath.stem.split("-", 1)
+            chapter_name = chapter_name.replace('-', ' ').capitalize()
+            return f"Modeling Agents with Probabilistic Programs - Chapter {chapter_num}: {chapter_name}"
+        chapter_name = filepath.stem.replace('-', ' ').capitalize()
+        return f"Modeling Agents with Probabilistic Programs - {chapter_name}"
+
+    def _get_url(self, filepath: Path) -> str | None:
+        """
+        Receives a filepath and retrieves the url.
+        Examples:
+            if filepath.stem: 6-efficient-inference
+            then url: https://agentmodels.org/chapters/6-efficient-inference.html
+
+            if filepath.stem: .3d-something
+            then url: None
+        """
+        if filepath.stem.startswith('.'):
+            return None # unusual file
+        # TODO: The website has "hidden" the pages for chapter 6 (filepath.stem.startswith("6")),
+        # so this link does not point to the actual text of that chapter; this still needs fixing.
+        return f"https://agentmodels.org/chapters/{filepath.stem}.html"
 
-    def process_entry(self, filename):
+    def process_entry(self, filepath):
         return self.make_data_entry(
             {
                 "source": self.name,
@@ -41,10 +80,10 @@ def process_entry(self, filename):
                     "John Salvatier",
                     "Daniel Filan",
                 ],
-                "date_published": self._get_published_date(filename),
-                "title": "Modeling Agents with Probabilistic Programs",
-                "url": f"https://agentmodels.org/chapters/{filename.stem}.html",
-                "filename": filename.name,
-                "text": filename.read_text(encoding="utf-8"),
+                "date_published": self._get_published_date(filepath),
+                "title": self._get_title(filepath),
+                "url": self._get_url(filepath),
+                "filename": filepath.name,
+                "text": filepath.read_text(encoding="utf-8"),
             }
         )