diff --git a/align_data/common/html_dataset.py b/align_data/common/html_dataset.py
index e5e4d277..950c0e55 100644
--- a/align_data/common/html_dataset.py
+++ b/align_data/common/html_dataset.py
@@ -81,7 +81,7 @@ def process_entry(self, article):
         return self.make_data_entry(contents)
 
     def fetch_contents(self, url):
-        logger.info("Fetching {}".format(url))
+        logger.debug(f"Fetching {url}")
         resp = requests.get(url, allow_redirects=True)
         return BeautifulSoup(resp.content, "html.parser")
 
@@ -141,7 +141,7 @@ def fetch_contents(self, url):
         if "content" in item:
             return item
 
-        logger.info("Fetching {}".format(url))
+        logger.debug(f"Fetching {url}")
         resp = requests.get(url, allow_redirects=True)
         soup = BeautifulSoup(resp.content, "html.parser")
         return dict(
diff --git a/align_data/db/models.py b/align_data/db/models.py
index e79da232..36bd1ff3 100644
--- a/align_data/db/models.py
+++ b/align_data/db/models.py
@@ -82,7 +82,8 @@ class Article(Base):
     )
 
     def __repr__(self) -> str:
-        return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={self.date_published!r})"
+        formatted_date = self.date_published.strftime('%Y-%m-%d %H:%M:%S%z') if self.date_published else None
+        return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={formatted_date!r})"
 
     def generate_id_string(self) -> bytes:
         return "".join(
diff --git a/align_data/sources/arbital/arbital.py b/align_data/sources/arbital/arbital.py
index b08393c4..47a87ef0 100644
--- a/align_data/sources/arbital/arbital.py
+++ b/align_data/sources/arbital/arbital.py
@@ -184,7 +184,7 @@ def process_entry(self, alias: str):
 
         return self.make_data_entry(
             {
-                "title": page.get("title") or "",
+                "title": page.get("title") or None,
                 "text": text,
                 "date_published": self._get_published_date(page),
                 "url": f'https://arbital.com/p/{page.get("alias") or alias}',
diff --git a/align_data/sources/articles/articles.py b/align_data/sources/articles/articles.py
index 7db94a7b..953a3253 100644
--- a/align_data/sources/articles/articles.py
+++ b/align_data/sources/articles/articles.py
@@ -49,7 +49,7 @@ def save_pdf(filename, link):
 @with_retry(times=3, exceptions=gspread.exceptions.APIError)
 def process_row(row, sheets):
     """Check the given `row` and fetch its metadata + optional extra stuff."""
-    logger.info('Checking "%s"', row["title"])
+    logger.debug('Checking "%s"', row["title"])
 
     missing = [field for field in REQUIRED_FIELDS if not row.get(field)]
     if missing:
@@ -91,7 +91,7 @@ def process_spreadsheets(source_sheet, output_sheets):
     :param Worksheet source_sheet: the worksheet to be processed - each row should be a separate entry
     :param Dict[str, Worksheet] output_sheets: a dict of per data type worksheets to be updated
     """
-    logger.info("fetching seen urls")
+    logger.info(f"fetching seen urls in {output_sheets}")
     seen = {
         url
         for sheet in output_sheets.values()
@@ -120,8 +120,8 @@ def update_new_items(source_spreadsheet, source_sheet, output_spreadsheet):
     return process_spreadsheets(source_sheet, sheets)
 
 
-def check_new_articles(source_spreadsheet, source_sheet):
-    """Goes through the special indices looking for unseen articles."""
+def check_new_articles(source_spreadsheet, source_sheet) -> int:
+    """Goes through the special indices looking for unseen articles to update. Returns the number of updated rows."""
     source_sheet = get_sheet(source_spreadsheet, source_sheet)
     current = {row.get("title"): row for row in iterate_rows(source_sheet)}
     seen_urls = {
diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py
index cbf7f9d9..b881f364 100644
--- a/align_data/sources/articles/datasets.py
+++ b/align_data/sources/articles/datasets.py
@@ -245,6 +245,6 @@ def get_contents(cls, item) -> Dict:
         return contents
 
     def process_entry(self, item):
-        logger.info(f"Processing {item.title}")
+        logger.debug(f"Processing {item.title}")
 
         return self.make_data_entry(self.get_contents(item), source=self.name)
diff --git a/align_data/sources/blogs/blogs.py b/align_data/sources/blogs/blogs.py
index 1245aec6..11267c2b 100644
--- a/align_data/sources/blogs/blogs.py
+++ b/align_data/sources/blogs/blogs.py
@@ -94,7 +94,7 @@ def items_list(self):
         page = 1
         with tqdm(desc=f"Loading {self.name} pages") as pbar:
             while True:
-                logger.info(f"Fetching entries from {self.url}")
+                logger.debug(f"Fetching entries from {self.url}")
                 response = requests.get(
                     self.url, allow_redirects=True, params={"73df3071_page": page}
                 )
diff --git a/align_data/sources/blogs/gwern_blog.py b/align_data/sources/blogs/gwern_blog.py
index 1d573a8e..f3a82882 100644
--- a/align_data/sources/blogs/gwern_blog.py
+++ b/align_data/sources/blogs/gwern_blog.py
@@ -71,7 +71,7 @@ def extract(item):
         return dict(filter(None, map(extract, header.splitlines())))
 
     def _get_article(self, url):
-        logger.info("Fetching {}".format(url))
+        logger.debug(f"Fetching {url}")
         return requests.get(url, allow_redirects=True)
 
     @staticmethod
diff --git a/align_data/sources/blogs/wp_blog.py b/align_data/sources/blogs/wp_blog.py
index cd409d98..b7a60ef7 100644
--- a/align_data/sources/blogs/wp_blog.py
+++ b/align_data/sources/blogs/wp_blog.py
@@ -28,7 +28,7 @@ def items_list(self):
         with tqdm(desc=f"Loading {self.name} pages") as pbar:
             while True:
                 paged_url = f"{self.feed_url}?paged={page_number}"
-                logging.info(f"Fetching {paged_url}")
+                logger.debug(f"Fetching {paged_url}")
                 feed = feedparser.parse(paged_url)
                 title = feed.get("feed", {}).get("title")
diff --git a/align_data/sources/ebooks/agentmodels.py b/align_data/sources/ebooks/agentmodels.py
index 65b52502..8915fbe3 100644
--- a/align_data/sources/ebooks/agentmodels.py
+++ b/align_data/sources/ebooks/agentmodels.py
@@ -21,7 +21,7 @@ def setup(self):
         super().setup()
         self.base_dir = self.raw_data_path / "agentmodels.org"
         if not self.base_dir.exists() or not list(self.base_dir.glob("*")):
-            logger.info("Cloning repo")
+            logger.info(f"Cloning repo {self.repo}")
             Repo.clone_from(self.repo, self.base_dir)
         self.repository = Repo(self.base_dir)
         self.files_path = self.base_dir / "chapters"
diff --git a/align_data/sources/stampy/stampy.py b/align_data/sources/stampy/stampy.py
index 95319820..5dc24cf0 100644
--- a/align_data/sources/stampy/stampy.py
+++ b/align_data/sources/stampy/stampy.py
@@ -49,7 +49,7 @@ def clean_text(text):
         answer = clean_text(entry["Rich Text"])
         url = "https://aisafety.info?state=" + entry["UI ID"]
 
-        logger.info(f"Processing {question}")
+        logger.debug(f"Processing {question}")
 
         return self.make_data_entry(
             {
diff --git a/main.py b/main.py
index 82c30f07..c047ce9b 100644
--- a/main.py
+++ b/main.py
@@ -60,7 +60,7 @@ def fetch_all(self, *skip) -> None:
         """
         names = [name for name in ALL_DATASETS if name not in skip]
         for name in names:
-            print(name)
+            logger.debug(name)
             self.fetch(name)
 
     def generate_jsonl_files(self, *names):
@@ -74,7 +74,7 @@ def generate_jsonl_files(self, *names):
         assert not missing, f"{missing} are not valid dataset names"
        for name in names:
             dataset = get_dataset(name)
-            print(dataset.to_jsonl())
+            logger.info(dataset.to_jsonl())
 
     def count_tokens(self, merged_dataset_path: str) -> None:
         """