
Commit

add debug logging for logging that takes up a lot of space
Thomas-Lemoine committed Sep 11, 2023
1 parent f08b8b5 commit 7dfd81c
Showing 11 changed files with 17 additions and 16 deletions.
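For context on why demoting these calls from logger.info to logger.debug shrinks the output: with the standard library's level filtering, debug records are dropped before they reach any handler. A minimal sketch, assuming the pipeline is run with the root level at INFO (the logger name and URL below are illustrative):

import logging

logging.basicConfig(level=logging.INFO)  # assumed runtime configuration
logger = logging.getLogger("align_data.common.html_dataset")

logger.info("Fetching https://example.com")   # still emitted
logger.debug("Fetching https://example.com")  # silently dropped at INFO level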
4 changes: 2 additions & 2 deletions align_data/common/html_dataset.py
@@ -81,7 +81,7 @@ def process_entry(self, article):
return self.make_data_entry(contents)

def fetch_contents(self, url):
logger.info("Fetching {}".format(url))
logger.debug(f"Fetching {url}")
resp = requests.get(url, allow_redirects=True)
return BeautifulSoup(resp.content, "html.parser")

@@ -141,7 +141,7 @@ def fetch_contents(self, url):
if "content" in item:
return item

logger.info("Fetching {}".format(url))
logger.debug(f"Fetching {url}")
resp = requests.get(url, allow_redirects=True)
soup = BeautifulSoup(resp.content, "html.parser")
return dict(
3 changes: 2 additions & 1 deletion align_data/db/models.py
@@ -82,7 +82,8 @@ class Article(Base):
)

def __repr__(self) -> str:
return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={self.date_published!r})"
formatted_date = self.date_published.strftime('%Y-%m-%d %H:%M:%S%z')
return f"Article(id={self.id!r}, title={self.title!r}, url={self.url!r}, source={self.source!r}, authors={self.authors!r}, date_published={formatted_date!r})"

def generate_id_string(self) -> bytes:
return "".join(
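A small illustration of the reformatted date_published in __repr__, using a hypothetical timezone-aware value rather than anything from the database:

from datetime import datetime, timezone

date_published = datetime(2023, 9, 11, 12, 30, tzinfo=timezone.utc)  # made-up value
date_published.strftime('%Y-%m-%d %H:%M:%S%z')
# -> '2023-09-11 12:30:00+0000', shorter than the previous repr(date_published):
# 'datetime.datetime(2023, 9, 11, 12, 30, tzinfo=datetime.timezone.utc)'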
2 changes: 1 addition & 1 deletion align_data/sources/arbital/arbital.py
@@ -184,7 +184,7 @@ def process_entry(self, alias: str):

return self.make_data_entry(
{
"title": page.get("title") or "",
"title": page.get("title") or None,
"text": text,
"date_published": self._get_published_date(page),
"url": f'https://arbital.com/p/{page.get("alias") or alias}',
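The `or None` change collapses missing and empty titles to None instead of an empty string; a quick illustration with made-up page dicts:

page = {"alias": "some_page"}               # hypothetical payload with no title
page.get("title") or ""                     # -> '' (previous behaviour)
page.get("title") or None                   # -> None

page = {"title": "", "alias": "some_page"}  # hypothetical payload with an empty title
page.get("title") or None                   # -> None here as well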
8 changes: 4 additions & 4 deletions align_data/sources/articles/articles.py
@@ -49,7 +49,7 @@ def save_pdf(filename, link):
@with_retry(times=3, exceptions=gspread.exceptions.APIError)
def process_row(row, sheets):
"""Check the given `row` and fetch its metadata + optional extra stuff."""
- logger.info('Checking "%s"', row["title"])
+ logger.debug('Checking "%s"', row["title"])

missing = [field for field in REQUIRED_FIELDS if not row.get(field)]
if missing:
@@ -91,7 +91,7 @@ def process_spreadsheets(source_sheet, output_sheets):
:param Worksheet source_sheet: the worksheet to be processed - each row should be a separate entry
:param Dict[str, Worksheet] output_sheets: a dict of per data type worksheets to be updated
"""
logger.info("fetching seen urls")
logger.info("fetching seen urls in {output_sheets}")
seen = {
url
for sheet in output_sheets.values()
@@ -120,8 +120,8 @@ def update_new_items(source_spreadsheet, source_sheet, output_spreadsheet):
return process_spreadsheets(source_sheet, sheets)


- def check_new_articles(source_spreadsheet, source_sheet):
-     """Goes through the special indices looking for unseen articles."""
+ def check_new_articles(source_spreadsheet, source_sheet) -> int:
+     """Goes through the special indices looking for unseen articles to update. Returns the number of updated rows."""
source_sheet = get_sheet(source_spreadsheet, source_sheet)
current = {row.get("title"): row for row in iterate_rows(source_sheet)}
seen_urls = {
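Unlike the other hunks, process_row keeps logging's lazy "%s" style rather than an f-string; a short sketch of the difference (logger name and row are illustrative):

import logging

logger = logging.getLogger("align_data.sources.articles")
row = {"title": "Some article"}  # hypothetical row

# Lazy %-style: the final message is only rendered if the record passes the level check.
logger.debug('Checking "%s"', row["title"])

# f-string: the message is built eagerly, even when DEBUG records are filtered out.
logger.debug(f'Checking "{row["title"]}"')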
2 changes: 1 addition & 1 deletion align_data/sources/articles/datasets.py
@@ -245,6 +245,6 @@ def get_contents(cls, item) -> Dict:
return contents

def process_entry(self, item):
logger.info(f"Processing {item.title}")
logger.debug(f"Processing {item.title}")

return self.make_data_entry(self.get_contents(item), source=self.name)
2 changes: 1 addition & 1 deletion align_data/sources/blogs/blogs.py
@@ -94,7 +94,7 @@ def items_list(self):
page = 1
with tqdm(desc=f"Loading {self.name} pages") as pbar:
while True:
logger.info(f"Fetching entries from {self.url}")
logger.debug(f"Fetching entries from {self.url}")
response = requests.get(
self.url, allow_redirects=True, params={"73df3071_page": page}
)
2 changes: 1 addition & 1 deletion align_data/sources/blogs/gwern_blog.py
@@ -71,7 +71,7 @@ def extract(item):
return dict(filter(None, map(extract, header.splitlines())))

def _get_article(self, url):
logger.info("Fetching {}".format(url))
logger.debug(f"Fetching {url}")
return requests.get(url, allow_redirects=True)

@staticmethod
2 changes: 1 addition & 1 deletion align_data/sources/blogs/wp_blog.py
@@ -28,7 +28,7 @@ def items_list(self):
with tqdm(desc=f"Loading {self.name} pages") as pbar:
while True:
paged_url = f"{self.feed_url}?paged={page_number}"
logging.info(f"Fetching {paged_url}")
logger.debug(f"Fetching {paged_url}")

feed = feedparser.parse(paged_url)
title = feed.get("feed", {}).get("title")
2 changes: 1 addition & 1 deletion align_data/sources/ebooks/agentmodels.py
@@ -21,7 +21,7 @@ def setup(self):
super().setup()
self.base_dir = self.raw_data_path / "agentmodels.org"
if not self.base_dir.exists() or not list(self.base_dir.glob("*")):
logger.info("Cloning repo")
logger.info(f"Cloning repo {self.repo}")
Repo.clone_from(self.repo, self.base_dir)
self.repository = Repo(self.base_dir)
self.files_path = self.base_dir / "chapters"
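For readers unfamiliar with the surrounding code, a self-contained sketch of the clone-or-reuse pattern the changed line sits in, assuming GitPython; the local path and repository URL are illustrative, not taken from the project's configuration:

from pathlib import Path
from git import Repo  # GitPython

base_dir = Path("data/raw/agentmodels.org")                   # assumed layout
repo_url = "https://github.com/agentmodels/agentmodels.org"   # assumed URL

if not base_dir.exists() or not any(base_dir.iterdir()):
    # First run: clone the repository and keep a handle to it.
    repository = Repo.clone_from(repo_url, base_dir)
else:
    # Later runs: reuse the existing checkout.
    repository = Repo(base_dir)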
2 changes: 1 addition & 1 deletion align_data/sources/stampy/stampy.py
@@ -49,7 +49,7 @@ def clean_text(text):
answer = clean_text(entry["Rich Text"])
url = "https://aisafety.info?state=" + entry["UI ID"]

logger.info(f"Processing {question}")
logger.debug(f"Processing {question}")

return self.make_data_entry(
{
4 changes: 2 additions & 2 deletions main.py
@@ -60,7 +60,7 @@ def fetch_all(self, *skip) -> None:
"""
names = [name for name in ALL_DATASETS if name not in skip]
for name in names:
- print(name)
+ logger.debug(name)
self.fetch(name)

def generate_jsonl_files(self, *names):
@@ -74,7 +74,7 @@ def generate_jsonl_files(self, *names):
assert not missing, f"{missing} are not valid dataset names"
for name in names:
dataset = get_dataset(name)
- print(dataset.to_jsonl())
+ logger.info(dataset.to_jsonl())

def count_tokens(self, merged_dataset_path: str) -> None:
"""
