Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix title and url for agentmodels #187

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 51 additions & 12 deletions align_data/sources/ebooks/agentmodels.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from align_data.common.alignment_dataset import AlignmentDataset
from pathlib import Path
from dataclasses import dataclass
from git import Repo
import logging
from datetime import timezone
from datetime import datetime, timezone

from align_data.common.alignment_dataset import AlignmentDataset
from git import Repo

logger = logging.getLogger(__name__)

Expand All @@ -22,15 +24,52 @@ def setup(self):
self.base_dir = self.raw_data_path / "agentmodels.org"
if not self.base_dir.exists() or not list(self.base_dir.glob("*")):
logger.info("Cloning repo")
Repo.clone_from(self.repo, self.base_dir)
Repo.clone_from(url=self.repo, to_path=self.base_dir)
self.repository = Repo(self.base_dir)
self.files_path = self.base_dir / "chapters"

def _get_published_date(self, filename):
last_commit = next(self.repository.iter_commits(paths=f"chapters/{filename.name}"))
@property
def items_list(self):
    """
    Return the iterator produced by ``self.files_path.iterdir()``.

    Every entry directly inside ``self.files_path`` is yielded as a ``Path``;
    no filtering is applied here.
    """
    chapters_dir = self.files_path
    return chapters_dir.iterdir()

def _get_published_date(self, filepath: Path) -> datetime:
last_commit = next(self.repository.iter_commits(paths=f"chapters/{filepath.name}"))
return last_commit.committed_datetime.astimezone(timezone.utc)

def _get_title(self, filepath: Path) -> str | None:
"""
Receives a filepath, and retrieves the title.
Examples:
if filepath.stem: 6-efficient-inference
then title: Modeling Agents with Probabilistic Programs - Chapter 6: Efficient Inference"

if filepath.stem: 2-webppl
then title: Modeling Agents with Probabilistic Programs - Chapter 2: Webppl"
"""
if filepath.stem[:1].isnumeric():
chapter_num, chapter_name = filepath.stem.split("-", 1)
chapter_name = chapter_name.replace('-', ' ').capitalize()
return f"Modeling Agents with Probabilistic Programs - Chapter {chapter_num}: {chapter_name}"
chapter_name = filepath.stem.replace('-', ' ').capitalize()
return f"Modeling Agents with Probabilistic Programs - {chapter_name}"

def _get_url(self, filepath: Path) -> str | None:
"""
Receives a filepath and retrieves the url.
Examples:
if filepath.stem: 6-efficient-inference
then url: https://agentmodels.org/chapters/6-efficient-inference.html"

if filepath.stem: .3d-something
then url: None
"""
if filepath.stem.startswith('.'):
return None # unusual file
#TODO: The website has "hidden" the pages for chapter 6 (filepath.stem.startswith("6")), so the
# link doesn't point to the actual text of this chapter. To fix.
return f"https://agentmodels.org/chapters/{filepath.stem}.html"

def process_entry(self, filename):
def process_entry(self, filepath):
return self.make_data_entry(
{
"source": self.name,
Expand All @@ -41,10 +80,10 @@ def process_entry(self, filename):
"John Salvatier",
"Daniel Filan",
],
"date_published": self._get_published_date(filename),
"title": "Modeling Agents with Probabilistic Programs",
"url": f"https://agentmodels.org/chapters/{filename.stem}.html",
"filename": filename.name,
"text": filename.read_text(encoding="utf-8"),
"date_published": self._get_published_date(filepath),
"title": self._get_title(filepath),
"url": self._get_url(filepath),
"filename": filepath.name,
"text": filepath.read_text(encoding="utf-8"),
}
)
Loading