Skip to content

Commit

Permalink
fix title and url for agentmodels
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas-Lemoine committed Sep 10, 2023
1 parent af6f577 commit b44afcf
Showing 1 changed file with 51 additions and 12 deletions.
63 changes: 51 additions & 12 deletions align_data/sources/ebooks/agentmodels.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from align_data.common.alignment_dataset import AlignmentDataset
from pathlib import Path
from dataclasses import dataclass
from git import Repo
import logging
from datetime import timezone
from datetime import datetime, timezone

from align_data.common.alignment_dataset import AlignmentDataset
from git import Repo

logger = logging.getLogger(__name__)

Expand All @@ -22,15 +24,52 @@ def setup(self):
self.base_dir = self.raw_data_path / "agentmodels.org"
if not self.base_dir.exists() or not list(self.base_dir.glob("*")):
logger.info("Cloning repo")
Repo.clone_from(self.repo, self.base_dir)
Repo.clone_from(url=self.repo, to_path=self.base_dir)
self.repository = Repo(self.base_dir)
self.files_path = self.base_dir / "chapters"

def _get_published_date(self, filename):
last_commit = next(self.repository.iter_commits(paths=f"chapters/{filename.name}"))
@property
def items_list(self):
    """Return an iterator over the entries directly inside ``self.files_path``."""
    chapters_dir = self.files_path
    return chapters_dir.iterdir()

def _get_published_date(self, filepath: Path) -> datetime:
    """
    Return the commit time, converted to UTC, of the first commit that
    ``self.repository.iter_commits`` yields for ``chapters/<file name>``.

    Only the file's name is used; the path is always looked up under
    ``chapters/`` relative to the repository.
    """
    repo_relative_path = f"chapters/{filepath.name}"
    commits = self.repository.iter_commits(paths=repo_relative_path)
    newest = next(commits)
    committed_at = newest.committed_datetime
    return committed_at.astimezone(timezone.utc)

def _get_title(self, filepath: Path) -> str:
    """
    Build the entry title from a chapter file's name.

    The file stem is read as "<chapter number>-<chapter name>" when it starts
    with a digit, and as a bare name otherwise. Dashes in the name become
    spaces and every word is capitalized.

    Examples:
        if filepath.stem: 6-efficient-inference
            then title: Modeling Agents with Probabilistic Programs - Chapter 6: Efficient Inference
        if filepath.stem: 2-webppl
            then title: Modeling Agents with Probabilistic Programs - Chapter 2: Webppl
        if filepath.stem: 6
            then title: Modeling Agents with Probabilistic Programs - Chapter 6
        if filepath.stem: meetup-2017
            then title: Modeling Agents with Probabilistic Programs - Meetup 2017
    """
    book_title = "Modeling Agents with Probabilistic Programs"
    stem = filepath.stem

    if stem[:1].isnumeric():
        # partition (unlike split + unpacking) cannot raise when the stem has
        # no "-" at all, e.g. a file simply named "6.md".
        chapter_num, _, slug = stem.partition("-")
    else:
        chapter_num, slug = None, stem

    # Capitalize word by word: str.capitalize() on the whole name would only
    # upper-case the first word ("Efficient inference").
    chapter_name = " ".join(word.capitalize() for word in slug.split("-"))

    if chapter_num is None:
        return f"{book_title} - {chapter_name}"
    if not chapter_name:
        return f"{book_title} - Chapter {chapter_num}"
    return f"{book_title} - Chapter {chapter_num}: {chapter_name}"

def _get_url(self, filepath: Path) -> str | None:
"""
Receives a filepath and retrieves the url.
Examples:
if filepath.stem: 6-efficient-inference
then url: https://agentmodels.org/chapters/6-efficient-inference.html"
if filepath.stem: .3d-something
then url: None
"""
if filepath.stem.startswith('.'):
return None # unusual file
#TODO: The website has "hidden" the pages for chapter 6 (filepath.stem.startswith("6")), so the
# link doesn't point to the actual text of this chapter. To fix.
return f"https://agentmodels.org/chapters/{filepath.stem}.html"

def process_entry(self, filename):
def process_entry(self, filepath):
return self.make_data_entry(
{
"source": self.name,
Expand All @@ -41,10 +80,10 @@ def process_entry(self, filename):
"John Salvatier",
"Daniel Filan",
],
"date_published": self._get_published_date(filename),
"title": "Modeling Agents with Probabilistic Programs",
"url": f"https://agentmodels.org/chapters/{filename.stem}.html",
"filename": filename.name,
"text": filename.read_text(encoding="utf-8"),
"date_published": self._get_published_date(filepath),
"title": self._get_title(filepath),
"url": self._get_url(filepath),
"filename": filepath.name,
"text": filepath.read_text(encoding="utf-8"),
}
)

0 comments on commit b44afcf

Please sign in to comment.