diff --git a/align_data/common/html_dataset.py b/align_data/common/html_dataset.py
index e5e4d277..950c0e55 100644
--- a/align_data/common/html_dataset.py
+++ b/align_data/common/html_dataset.py
@@ -81,7 +81,7 @@ def process_entry(self, article):
return self.make_data_entry(contents)
def fetch_contents(self, url):
- logger.info("Fetching {}".format(url))
+ logger.debug(f"Fetching {url}")
resp = requests.get(url, allow_redirects=True)
return BeautifulSoup(resp.content, "html.parser")
@@ -141,7 +141,7 @@ def fetch_contents(self, url):
if "content" in item:
return item
- logger.info("Fetching {}".format(url))
+ logger.debug(f"Fetching {url}")
resp = requests.get(url, allow_redirects=True)
soup = BeautifulSoup(resp.content, "html.parser")
return dict(
diff --git a/align_data/db/models.py b/align_data/db/models.py
index e79da232..36bd1ff3 100644
--- a/align_data/db/models.py
+++ b/align_data/db/models.py
@@ -82,7 +82,8 @@ class Article(Base):
)
def __repr__(self) -> str:
- return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={self.date_published!r})"
+        formatted_date = self.date_published.strftime('%Y-%m-%d %H:%M:%S%z') if self.date_published else None
+ return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={formatted_date!r})"
def generate_id_string(self) -> bytes:
return "".join(
diff --git a/align_data/sources/arbital/arbital.py b/align_data/sources/arbital/arbital.py
index b08393c4..47a87ef0 100644
--- a/align_data/sources/arbital/arbital.py
+++ b/align_data/sources/arbital/arbital.py
@@ -184,7 +184,7 @@ def process_entry(self, alias: str):
return self.make_data_entry(
{
- "title": page.get("title") or "",
+ "title": page.get("title") or None,
"text": text,
"date_published": self._get_published_date(page),
"url": f'https://arbital.com/p/{page.get("alias") or alias}',
diff --git a/align_data/sources/articles/articles.py b/align_data/sources/articles/articles.py
index 7db94a7b..953a3253 100644
--- a/align_data/sources/articles/articles.py
+++ b/align_data/sources/articles/articles.py
@@ -49,7 +49,7 @@ def save_pdf(filename, link):
@with_retry(times=3, exceptions=gspread.exceptions.APIError)
def process_row(row, sheets):
"""Check the given `row` and fetch its metadata + optional extra stuff."""
- logger.info('Checking "%s"', row["title"])
+ logger.debug('Checking "%s"', row["title"])
missing = [field for field in REQUIRED_FIELDS if not row.get(field)]
if missing:
@@ -91,7 +91,7 @@ def process_spreadsheets(source_sheet, output_sheets):
:param Worksheet source_sheet: the worksheet to be processed - each row should be a separate entry
:param Dict[str, Worksheet] output_sheets: a dict of per data type worksheets to be updated
"""
- logger.info("fetching seen urls")
+    logger.info(f"fetching seen urls in {output_sheets}")
seen = {
url
for sheet in output_sheets.values()
@@ -120,8 +120,8 @@ def update_new_items(source_spreadsheet, source_sheet, output_spreadsheet):
return process_spreadsheets(source_sheet, sheets)
-def check_new_articles(source_spreadsheet, source_sheet):
- """Goes through the special indices looking for unseen articles."""
+def check_new_articles(source_spreadsheet, source_sheet) -> int:
+ """Goes through the special indices looking for unseen articles to update. Returns the number of updated rows."""
source_sheet = get_sheet(source_spreadsheet, source_sheet)
current = {row.get("title"): row for row in iterate_rows(source_sheet)}
seen_urls = {
diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py
index cbf7f9d9..b881f364 100644
--- a/align_data/sources/articles/datasets.py
+++ b/align_data/sources/articles/datasets.py
@@ -245,6 +245,6 @@ def get_contents(cls, item) -> Dict:
return contents
def process_entry(self, item):
- logger.info(f"Processing {item.title}")
+ logger.debug(f"Processing {item.title}")
return self.make_data_entry(self.get_contents(item), source=self.name)
diff --git a/align_data/sources/blogs/blogs.py b/align_data/sources/blogs/blogs.py
index 1245aec6..11267c2b 100644
--- a/align_data/sources/blogs/blogs.py
+++ b/align_data/sources/blogs/blogs.py
@@ -94,7 +94,7 @@ def items_list(self):
page = 1
with tqdm(desc=f"Loading {self.name} pages") as pbar:
while True:
- logger.info(f"Fetching entries from {self.url}")
+ logger.debug(f"Fetching entries from {self.url}")
response = requests.get(
self.url, allow_redirects=True, params={"73df3071_page": page}
)
diff --git a/align_data/sources/blogs/gwern_blog.py b/align_data/sources/blogs/gwern_blog.py
index 1d573a8e..f3a82882 100644
--- a/align_data/sources/blogs/gwern_blog.py
+++ b/align_data/sources/blogs/gwern_blog.py
@@ -71,7 +71,7 @@ def extract(item):
return dict(filter(None, map(extract, header.splitlines())))
def _get_article(self, url):
- logger.info("Fetching {}".format(url))
+ logger.debug(f"Fetching {url}")
return requests.get(url, allow_redirects=True)
@staticmethod
diff --git a/align_data/sources/blogs/wp_blog.py b/align_data/sources/blogs/wp_blog.py
index cd409d98..b7a60ef7 100644
--- a/align_data/sources/blogs/wp_blog.py
+++ b/align_data/sources/blogs/wp_blog.py
@@ -28,7 +28,7 @@ def items_list(self):
with tqdm(desc=f"Loading {self.name} pages") as pbar:
while True:
paged_url = f"{self.feed_url}?paged={page_number}"
- logging.info(f"Fetching {paged_url}")
+ logger.debug(f"Fetching {paged_url}")
feed = feedparser.parse(paged_url)
title = feed.get("feed", {}).get("title")
diff --git a/align_data/sources/ebooks/agentmodels.py b/align_data/sources/ebooks/agentmodels.py
index 65b52502..8915fbe3 100644
--- a/align_data/sources/ebooks/agentmodels.py
+++ b/align_data/sources/ebooks/agentmodels.py
@@ -21,7 +21,7 @@ def setup(self):
super().setup()
self.base_dir = self.raw_data_path / "agentmodels.org"
if not self.base_dir.exists() or not list(self.base_dir.glob("*")):
- logger.info("Cloning repo")
+ logger.info(f"Cloning repo {self.repo}")
Repo.clone_from(self.repo, self.base_dir)
self.repository = Repo(self.base_dir)
self.files_path = self.base_dir / "chapters"
diff --git a/align_data/sources/stampy/stampy.py b/align_data/sources/stampy/stampy.py
index 95319820..5dc24cf0 100644
--- a/align_data/sources/stampy/stampy.py
+++ b/align_data/sources/stampy/stampy.py
@@ -49,7 +49,7 @@ def clean_text(text):
answer = clean_text(entry["Rich Text"])
url = "https://aisafety.info?state=" + entry["UI ID"]
- logger.info(f"Processing {question}")
+ logger.debug(f"Processing {question}")
return self.make_data_entry(
{
diff --git a/main.py b/main.py
index 82c30f07..c047ce9b 100644
--- a/main.py
+++ b/main.py
@@ -60,7 +60,7 @@ def fetch_all(self, *skip) -> None:
"""
names = [name for name in ALL_DATASETS if name not in skip]
for name in names:
- print(name)
+ logger.debug(name)
self.fetch(name)
def generate_jsonl_files(self, *names):
@@ -74,7 +74,7 @@ def generate_jsonl_files(self, *names):
assert not missing, f"{missing} are not valid dataset names"
for name in names:
dataset = get_dataset(name)
- print(dataset.to_jsonl())
+ logger.info(dataset.to_jsonl())
def count_tokens(self, merged_dataset_path: str) -> None:
"""