Article checker #182

Merged 2 commits on Oct 26, 2023
65 changes: 65 additions & 0 deletions .github/workflows/check-articles.yml
@@ -0,0 +1,65 @@
name: Check articles are valid

on:
  workflow_call:
    inputs:
      datasource:
        type: string
        required: true
  workflow_dispatch: # allow manual triggering
    inputs:
      datasource:
        description: 'The datasource to process'
        type: choice
        options:
          - all
          - agentmodels
          - agisf
          - aisafety.info
          - alignment_newsletter
          - alignmentforum
          - arbital
          - arxiv
          - blogs
          - distill
          - eaforum
          - indices
          - lesswrong
          - special_docs
          - youtube
  schedule:
    - cron: "0 */4 * * *" # Every 4 hours

jobs:
  build-dataset:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Setup Python environment
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Install Pandoc
        run: |
          if [ "${{ inputs.datasource }}" = "gdocs" ]; then
            sudo apt-get update
            sudo apt-get -y install pandoc
          fi
      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Process dataset
        env:
          CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
          AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }}
          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
          ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
          ARD_DB_NAME: alignment_research_dataset
        run: python main.py fetch ${{ inputs.datasource }}
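
Note: besides the schedule, the workflow_dispatch trigger allows manual runs. Assuming the GitHub CLI is available, a one-off run for a single datasource would look something like gh workflow run check-articles.yml -f datasource=arxiv, with -f filling in the input defined above.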
70 changes: 13 additions & 57 deletions align_data/common/alignment_dataset.py
@@ -1,14 +1,16 @@
import re
from datetime import datetime
from itertools import islice
import logging
import time
from dataclasses import dataclass, field, KW_ONLY
from pathlib import Path
from typing import List, Optional, Set, Iterable, Tuple, Generator
from typing import Iterable, List, Optional, Set, Tuple, Generator
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload

import pytz
from sqlalchemy import select, Select, JSON
from sqlalchemy import select, Select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload, Session
import jsonlines
@@ -17,8 +19,7 @@

from align_data.db.models import Article, Summary
from align_data.db.session import make_session
from align_data.settings import ARTICLE_MAIN_KEYS
from align_data.sources.utils import merge_dicts
from align_data.common.formatters import normalize_url, article_dict

logger = logging.getLogger(__name__)

@@ -64,28 +65,10 @@ def __post_init__(self):
    def __str__(self) -> str:
        return self.name

    def _add_authors(self, article: Article, authors: List[str]) -> Article:
        # TODO: Don't keep adding the same authors - come up with some way to reuse them
        article.authors = ",".join(authors)
        if len(article.authors) > 1024:
            article.authors = ",".join(article.authors[:1024].split(",")[:-1])
        return article

    def make_data_entry(self, data, **kwargs) -> Article:
        data = merge_dicts(data, kwargs)

        summaries = data.pop("summaries", [])
        summary = data.pop("summary", None)
        summaries += [summary] if summary else []

        authors = data.pop("authors", [])
        data['title'] = (data.get('title') or '').replace('\n', ' ').replace('\r', '') or None

        article = Article(
            meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
            **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
        )
        self._add_authors(article, authors)
        data = article_dict(data, **kwargs)
        summaries = data.pop('summaries', [])
        article = Article(**data)
        article.summaries += [Summary(text=summary, source=self.name) for summary in summaries]
        return article

@@ -109,7 +92,7 @@ def read_entries(self, sort_by=None) -> Iterable[Article]:
            query = self._query_items.options(joinedload(Article.summaries))
            if sort_by is not None:
                query = query.order_by(sort_by)

            result = session.scalars(query)
            for article in result.unique():  # removes duplicates
                yield article
@@ -153,41 +136,14 @@ def get_item_key(self, item) -> str:
"""
return item.name

@staticmethod
def _normalize_url(url: str | None) -> str | None:
if not url:
return url

# ending '/'
url = url.rstrip("/")

# Remove http and use https consistently
url = url.replace("http://", "https://")

# Remove www
url = url.replace("https://www.", "https://")

# Remove index.html or index.htm
url = re.sub(r'/index\.html?$', '', url)

# Convert youtu.be links to youtube.com
url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")

# Additional rules for mirror domains can be added here

# agisafetyfundamentals.com -> aisafetyfundamentals.com
url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")

return url

def _normalize_urls(self, urls: Iterable[str]) -> Set[str]:
return {self._normalize_url(url) for url in urls}
return {normalize_url(url) for url in urls}


    def _load_outputted_items(self) -> Set[str]:
        """
        Loads the outputted items from the database and returns them as a set.

        If the done_key is not an attribute of Article, it will try to load it from the meta field.
        """
        with make_session() as session:
@@ -207,7 +163,7 @@ def not_processed(self, item) -> bool:
        # cause problems (e.g. massive RAM usage, big slowdowns) then it will have to be switched around, so that
        # this function runs a query to check if the item is in the database rather than first getting all done_keys.
        # If it gets to that level, consider batching it somehow
        return self._normalize_url(self.get_item_key(item)) not in self._outputted_items
        return normalize_url(self.get_item_key(item)) not in self._outputted_items

    def unprocessed_items(self, items=None) -> list | filter:
        """Return a list of all items to be processed.
60 changes: 60 additions & 0 deletions align_data/common/formatters.py
@@ -0,0 +1,60 @@
import re
Collaborator: nice

from typing import Any, Dict, List

from align_data.settings import ARTICLE_MAIN_KEYS
from align_data.sources.utils import merge_dicts


def normalize_url(url: str | None) -> str | None:
    if not url:
        return url

    # ending '/'
    url = url.rstrip("/")

    # Remove http and use https consistently
    url = url.replace("http://", "https://")

    # Remove www
    url = url.replace("https://www.", "https://")

    # Remove index.html or index.htm
    url = re.sub(r'/index\.html?$', '', url)

    # Convert youtu.be links to youtube.com
    url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")

    # Additional rules for mirror domains can be added here

    # agisafetyfundamentals.com -> aisafetyfundamentals.com
    url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")

    return url


def normalize_text(text: str | None) -> str | None:
    return (text or '').replace('\n', ' ').replace('\r', '').strip() or None


def format_authors(authors: List[str]) -> str:
    # TODO: Don't keep adding the same authors - come up with some way to reuse them
    authors_str = ",".join(authors)
    if len(authors_str) > 1024:
        authors_str = ",".join(authors_str[:1024].split(",")[:-1])
    return authors_str


def article_dict(data, **kwargs) -> Dict[str, Any]:
    data = merge_dicts(data, kwargs)

    summaries = data.pop("summaries", [])
    summary = data.pop("summary", None)

    data['summaries'] = summaries + [summary] if summary else []
    data['authors'] = format_authors(data.pop("authors", []))
    data['title'] = normalize_text(data.get('title'))

    return dict(
        meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
        **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
    )
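
For a sense of what the new helpers do, here is a quick sketch (the inputs are made-up examples, not taken from the dataset):

from align_data.common.formatters import normalize_url, normalize_text

normalize_url("http://www.example.com/some/page/index.html")
# -> "https://example.com/some/page"
normalize_url("https://youtu.be/abc123")
# -> "https://youtube.com/watch?v=abc123"
normalize_text("  A title\r\nsplit over lines ")
# -> "A title split over lines"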
1 change: 1 addition & 0 deletions align_data/db/models.py
@@ -71,6 +71,7 @@ class Article(Base):
    date_updated: Mapped[Optional[datetime]] = mapped_column(
        DateTime, onupdate=func.current_timestamp()
    )
    date_checked: Mapped[datetime] = mapped_column(DateTime, default=func.now())  # Timestamp of when this article was last checked to see if it's still valid
    status: Mapped[Optional[str]] = mapped_column(String(256))
    comments: Mapped[Optional[str]] = mapped_column(LONGTEXT)  # Editor comments. Can be anything

9 changes: 5 additions & 4 deletions align_data/sources/articles/datasets.py
@@ -12,6 +12,7 @@
from sqlalchemy import select, Select

from align_data.common.alignment_dataset import AlignmentDataset
from align_data.common.formatters import normalize_url
from align_data.db.models import Article
from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown
from align_data.sources.articles.parsers import (
@@ -127,16 +128,16 @@ def not_processed(self, item: tuple) -> bool:
        url = self.maybe(item, "url")
        source_url = self.maybe(item, "source_url")

        if item_key and self._normalize_url(item_key) in self._outputted_items:
        if item_key and normalize_url(item_key) in self._outputted_items:
            return False

        for given_url in [url, source_url]:
            if given_url:
                norm_url = self._normalize_url(given_url)
                norm_url = normalize_url(given_url)
                if norm_url in self._outputted_items:
                    return False

                norm_canonical_url = self._normalize_url(arxiv_canonical_url(given_url))
                norm_canonical_url = normalize_url(arxiv_canonical_url(given_url))
                if norm_canonical_url in self._outputted_items:
                    return False

90 changes: 90 additions & 0 deletions align_data/sources/validate.py
@@ -0,0 +1,90 @@
import logging
Collaborator: I don't have a strong opinion on it, but would it make more sense to have this file in align_data/db, since it's used to validate db objects rather than any source in particular (afaict)?

Collaborator (author): Sort of. It also makes use of the parsers, as it fetches data from the appropriate source etc., so I'm not sure. :/

from datetime import datetime, timedelta
from typing import Any, List

from tqdm import tqdm
from sqlalchemy.exc import IntegrityError
from align_data.common.formatters import normalize_url, normalize_text, article_dict
from align_data.db.session import make_session
from align_data.db.models import Article
from align_data.sources.articles.parsers import item_metadata
from align_data.sources.articles.html import fetch


logger = logging.getLogger(__name__)


def update_article_field(article: Article, field: str, value: Any):
    if not value:
        return

    if field == 'url' and normalize_url(article.url) == normalize_url(value):
        # This is pretty much the same url, so don't modify it
        return
    if field == 'title' and normalize_text(article.title) == normalize_text(value):
        # If there are slight differences in the titles (e.g. punctuation), assume the
        # database version is more correct
        return
    if field == 'meta':
        article.meta = article.meta or {}
        for k, v in value.items():
            meta_val = article.meta.get(k)
            try:
                if not meta_val or v > meta_val:
                    article.meta[k] = v
            except Exception as e:
                # Ignore exceptions here - the metadata isn't that important
                logger.info('Error checking metadata value for article %s: %s', article.url, value)
        return

    article_val = getattr(article, field, None)
    # Assume that if the provided value is larger (or later, in the case of dates), then it's
    # better. This might very well not hold, but it seems like a decent heuristic?
    if not article_val:
        setattr(article, field, value)
    elif isinstance(value, datetime) and value > article_val:
        setattr(article, field, value)
    elif isinstance(value, str) and len(normalize_text(value) or '') > len(normalize_text(article_val) or ''):
        setattr(article, field, normalize_text(value))
Collaborator: The > heuristic seems fine, since it will only do anything for comparables like dates and numbers (and whatever else), but imo in other cases wouldn't it always update? Like, if the text changed it would always be updated, rather than only being updated when the length of the text grows, which seems to be what's going on here. Let me know if I'm wrong.

Collaborator (author): Text changes seem to be:

  • slight edits for stylistic reasons etc., which often have a note attached to say when the edit happened - I'm hoping these will result in slightly longer texts
  • things being amended - this should result in a longer article
  • things being removed, or the article disappearing with only an error message displayed

I didn't want to just overwrite the text whenever it changed, because the changes might not be for the better. So this seemed like a heuristic that might be better than that. Albeit not by much.

Collaborator: What are you thinking would be situations where the more recent changes are worse? If I'm understanding it right, "contents" will only be successful (no errors, non-empty) when the url corresponds to a parsers-friendly url, i.e. a url that was originally fetched using the parsers, right? In that case, the newly fetched contents will be created through the same method as the original contents of the article, but will have more recent data as the only difference.

Collaborator: Actually, it's also meant to work with any pdf and epub. Is that the reason why the contents may be of lesser quality? If the url is not the thing we used to get the pdf in the first place, maybe this generic pdf parser will be worse than a specialized one. Seems tricky.

Collaborator (author): Exactly. Plus some websites return an error in HTML, but with a 200 code, or other such madness, so you can't really trust them. :/
Though it might be better to just always replace things if possible...
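
To make the heuristic concrete, it behaves roughly like this (a sketch with made-up values, assuming Article has text and date_published columns):

from datetime import datetime

from align_data.db.models import Article
from align_data.sources.validate import update_article_field

article = Article(text="Original body", date_published=datetime(2022, 1, 1))

# A longer text (e.g. an amended article) replaces the stored version...
update_article_field(article, "text", "Original body, plus a newly added paragraph")
# ...but a shorter one (e.g. a gutted article, or an error page served with a 200) is ignored.
update_article_field(article, "text", "404 not found")

# Dates are only bumped forwards, never backwards.
update_article_field(article, "date_published", datetime(2023, 6, 1))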



def update_article(article: Article) -> Article:
"""Check whether there are better data for this article and whether its url is pointing somewhere decent."""
source_url = article.meta.get('source_url') or article.url
contents = {}
if source_url:
contents = item_metadata(source_url)

if 'error' not in contents:
for field, value in article_dict(contents).items():
update_article_field(article, field, value)
else:
logger.info('Error getting contents for %s: %s', article, contents.get('error'))

if 400 <= fetch(article.url).status_code < 500:
logger.info('Could not get url for %s', article)
article.status = 'Unreachable url'

article.date_checked = datetime.utcnow()

return article


def check_articles(sources: List[str], batch_size=100):
Collaborator: Similarly to the above, maybe update_articles? Since that's what it does: updates them to the newest contents if possible.

Collaborator: This is really minor, but for consistency the first and second functions should have the same name, except one plural, imo (i.e. update_article and update_articles, or check_article and check_articles), since one just calls the other on a set of lots of articles.

"""Check `batch_size` articles with the given `sources` to see if they have better data."""
logger.info('Checking %s articles for %s', batch_size, ', '.join(sources))
with make_session() as session:
for article in tqdm(
session.query(Article)
.filter(Article.date_checked < datetime.now() - timedelta(weeks=4))
.filter(Article.source.in_(sources))
.limit(batch_size)
.all()
):
update_article(article)
session.add(article)
logger.debug('commiting')
try:
session.commit()
except IntegrityError as e:
logger.error(e)
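
A batch run would then look something like this (a sketch; the source names are just examples taken from the datasource list in the workflow above):

from align_data.sources.validate import check_articles

# Re-check up to 200 arxiv and lesswrong articles that haven't been validated in the last 4 weeks
check_articles(["arxiv", "lesswrong"], batch_size=200)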