diff --git a/.github/workflows/check-articles.yml b/.github/workflows/check-articles.yml
new file mode 100644
index 00000000..2e29a41f
--- /dev/null
+++ b/.github/workflows/check-articles.yml
@@ -0,0 +1,65 @@
+name: Check articles are valid
+
+on:
+  workflow_call:
+    inputs:
+      datasource:
+        type: string
+        required: true
+  workflow_dispatch: # allow manual triggering
+    inputs:
+      datasource:
+        description: 'The datasource to process'
+        type: choice
+        options:
+          - all
+          - agentmodels
+          - agisf
+          - aisafety.info
+          - alignment_newsletter
+          - alignmentforum
+          - arbital
+          - arxiv
+          - blogs
+          - distill
+          - eaforum
+          - indices
+          - lesswrong
+          - special_docs
+          - youtube
+  schedule:
+    - cron: "0 */4 * * *" # Every 4 hours
+
+jobs:
+  build-dataset:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Setup Python environment
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: Install Pandoc
+        run: |
+          if [ "${{ inputs.datasource }}" = "gdocs" ]; then
+            sudo apt-get update
+            sudo apt-get -y install pandoc
+          fi
+
+      - name: Install dependencies
+        run: pip install -r requirements.txt
+
+      - name: Process dataset
+        env:
+          CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
+          AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }}
+          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
+          ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
+          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
+          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
+          ARD_DB_NAME: alignment_research_dataset
+        run: python main.py fetch ${{ inputs.datasource }}
diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py
index 67b4bfe5..deaee140 100644
--- a/align_data/common/alignment_dataset.py
+++ b/align_data/common/alignment_dataset.py
@@ -5,7 +5,7 @@ import time
 from dataclasses import dataclass, KW_ONLY
 from pathlib import Path
-from typing import Iterable, List, Optional, Set
+from typing import Any, Dict, Iterable, List, Optional, Set
 from sqlalchemy import select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import joinedload
@@ -23,6 +23,62 @@
 logger = logging.getLogger(__name__)
+
+
+def normalize_url(url: str | None) -> str | None:
+    if not url:
+        return url
+
+    # ending '/'
+    url = url.rstrip("/")
+
+    # Remove http and use https consistently
+    url = url.replace("http://", "https://")
+
+    # Remove www
+    url = url.replace("https://www.", "https://")
+
+    # Remove index.html or index.htm
+    url = re.sub(r'/index\.html?$', '', url)
+
+    # Convert youtu.be links to youtube.com
+    url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")
+
+    # Additional rules for mirror domains can be added here
+
+    # agisafetyfundamentals.com -> aisafetyfundamentals.com
+    url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")
+
+    return url
+
+
+def normalize_text(text: str | None) -> str | None:
+    return (text or '').replace('\n', ' ').replace('\r', '').strip() or None
+
+
+def format_authors(authors: List[str]) -> str:
+    # TODO: Don't keep adding the same authors - come up with some way to reuse them
+    authors_str = ",".join(authors)
+    if len(authors_str) > 1024:
+        authors_str = ",".join(authors_str[:1024].split(",")[:-1])
+    return authors_str
+
+
+def article_dict(data, **kwargs) -> Dict[str, Any]:
+    data = merge_dicts(data, kwargs)
+
+    summaries = data.pop("summaries", [])
+    summary = data.pop("summary", None)
+
+    data['summaries'] = summaries + ([summary] if summary else [])
+    data['authors'] = format_authors(data.pop("authors", []))
+    data['title'] = normalize_text(data.get('title'))
+
+    return dict(
+        meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
+        **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
+    )
+
+
 @dataclass
 class AlignmentDataset:
     """The base dataset class."""
@@ -62,28 +118,10 @@ def __post_init__(self, data_path=Path(__file__).parent / "../../data/"):
         # set the default place to look for data
         self.files_path = self.raw_data_path / self.name

-    def _add_authors(self, article: Article, authors: List[str]) -> Article:
-        # TODO: Don't keep adding the same authors - come up with some way to reuse them
-        article.authors = ",".join(authors)
-        if len(article.authors) > 1024:
-            article.authors = ",".join(article.authors[:1024].split(",")[:-1])
-        return article
-
     def make_data_entry(self, data, **kwargs) -> Article:
-        data = merge_dicts(data, kwargs)
-
-        summaries = data.pop("summaries", [])
-        summary = data.pop("summary", None)
-        summaries += [summary] if summary else []
-
-        authors = data.pop("authors", [])
-        data['title'] = (data.get('title') or '').replace('\n', ' ').replace('\r', '') or None
-
-        article = Article(
-            meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
-            **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
-        )
-        self._add_authors(article, authors)
+        data = article_dict(data, **kwargs)
+        summaries = data.pop('summaries', [])
+        article = Article(**data)
         for summary in summaries:  # Note: This will be skipped if summaries is empty
             article.summaries.append(Summary(text=summary, source=self.name))
         return article
@@ -152,35 +190,8 @@ def get_item_key(self, item) -> str:
         """
         return item.name

-    @staticmethod
-    def _normalize_url(url: str | None) -> str | None:
-        if not url:
-            return url
-
-        # ending '/'
-        url = url.rstrip("/")
-
-        # Remove http and use https consistently
-        url = url.replace("http://", "https://")
-
-        # Remove www
-        url = url.replace("https://www.", "https://")
-
-        # Remove index.html or index.htm
-        url = re.sub(r'/index\.html?$', '', url)
-
-        # Convert youtu.be links to youtube.com
-        url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")
-
-        # Additional rules for mirror domains can be added here
-
-        # agisafetyfundamentals.com -> aisafetyfundamentals.com
-        url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")
-
-        return url
-
     def _normalize_urls(self, urls: Iterable[str]) -> Set[str]:
-        return {self._normalize_url(url) for url in urls}
+        return {normalize_url(url) for url in urls}

     def _load_outputted_items(self) -> Set[str]:
@@ -202,7 +213,7 @@ def not_processed(self, item) -> bool:
         # cause problems (e.g. massive RAM usage, big slow downs) then it will have to be switched around, so that
         # this function runs a query to check if the item is in the database rather than first getting all done_keys.
         # If it get's to that level, consider batching it somehow
-        return self._normalize_url(self.get_item_key(item)) not in self._outputted_items
+        return normalize_url(self.get_item_key(item)) not in self._outputted_items

     def unprocessed_items(self, items=None) -> Iterable:
         """Return a list of all items to be processed.
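For reference, a doctest-style sketch of how the new module-level helpers are expected to behave, based on the code above (illustrative only, not part of the patch):

```python
# Illustrative usage of the new helpers from align_data.common.alignment_dataset.
from align_data.common.alignment_dataset import normalize_url, normalize_text, format_authors

# URLs are coerced to a canonical https form, with www, trailing slashes and
# index.html stripped, and youtu.be links rewritten to youtube.com.
assert normalize_url("http://www.example.org/index.html") == "https://example.org"
assert normalize_url("https://youtu.be/abc123") == "https://youtube.com/watch?v=abc123"

# Text is collapsed onto a single line and stripped; empty strings become None.
assert normalize_text("  A title\nwith newlines \r ") == "A title with newlines"
assert normalize_text("   ") is None

# Authors are joined into a comma-separated string capped at 1024 characters,
# dropping the last partial name rather than cutting it mid-word.
assert format_authors(["Alice", "Bob"]) == "Alice,Bob"
```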
diff --git a/align_data/db/models.py b/align_data/db/models.py
index ae24650d..b0658964 100644
--- a/align_data/db/models.py
+++ b/align_data/db/models.py
@@ -63,6 +63,7 @@ class Article(Base):
     date_updated: Mapped[Optional[datetime]] = mapped_column(
         DateTime, onupdate=func.current_timestamp()
     )
+    date_checked: Mapped[datetime] = mapped_column(DateTime, default=func.now())  # The timestamp when this article was last checked to see whether it is still valid
     status: Mapped[Optional[str]] = mapped_column(String(256))
     comments: Mapped[Optional[str]] = mapped_column(LONGTEXT)  # Editor comments. Can be anything
diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py
index cbf7f9d9..1c322235 100644
--- a/align_data/sources/articles/datasets.py
+++ b/align_data/sources/articles/datasets.py
@@ -10,7 +10,7 @@
 from pypandoc import convert_file
 from sqlalchemy import select

-from align_data.common.alignment_dataset import AlignmentDataset
+from align_data.common.alignment_dataset import AlignmentDataset, normalize_url
 from align_data.db.models import Article
 from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown
 from align_data.sources.articles.parsers import (
@@ -122,16 +122,16 @@ def not_processed(self, item: tuple) -> bool:
         url = self.maybe(item, "url")
         source_url = self.maybe(item, "source_url")

-        if item_key and self._normalize_url(item_key) in self._outputted_items:
+        if item_key and normalize_url(item_key) in self._outputted_items:
             return False
-
+
         for given_url in [url, source_url]:
             if given_url:
-                norm_url = self._normalize_url(given_url)
+                norm_url = normalize_url(given_url)
                 if norm_url in self._outputted_items:
                     return False

-                norm_canonical_url = self._normalize_url(arxiv_canonical_url(given_url))
+                norm_canonical_url = normalize_url(arxiv_canonical_url(given_url))
                 if norm_canonical_url in self._outputted_items:
                     return False
diff --git a/align_data/sources/validate.py b/align_data/sources/validate.py
new file mode 100644
index 00000000..a97b8b20
--- /dev/null
+++ b/align_data/sources/validate.py
@@ -0,0 +1,84 @@
+import logging
+from datetime import datetime, timedelta
+from typing import Any, List
+
+from tqdm import tqdm
+from sqlalchemy.exc import IntegrityError
+from align_data.db.session import make_session
+from align_data.db.models import Article
+from align_data.common.alignment_dataset import normalize_url, normalize_text, article_dict
+from align_data.sources.articles.parsers import item_metadata
+from align_data.sources.articles.html import fetch
+
+
+logger = logging.getLogger(__name__)
+
+
+def update_article_field(article: Article, field: str, value: Any):
+    if not value:
+        return
+
+    if field == 'url' and normalize_url(article.url) == normalize_url(value):
+        # This is pretty much the same url, so don't modify it
+        return
+    if field == 'title' and normalize_text(article.title) == normalize_text(value):
+        # If there are slight differences in the titles (e.g. punctuation), assume the
+        # database version is more correct
+        return
+    if field == 'meta':
+        article.meta = article.meta or {}
+        for k, v in value.items():
+            meta_val = article.meta.get(k)
+            if not meta_val or v > meta_val:
+                article.meta[k] = v
+        return
+
+    article_val = getattr(article, field, None)
+    # Assume that if the provided value is larger (or later, in the case of dates), then it's
+    # better. This might very well not hold, but it seems like a decent heuristic?
+    if not article_val:
+        setattr(article, field, value)
+    elif isinstance(value, datetime) and value > article_val:
+        setattr(article, field, value)
+    elif isinstance(value, str) and len(normalize_text(value) or '') > len(normalize_text(article_val) or ''):
+        setattr(article, field, normalize_text(value))
+
+
+def check_article(article: Article) -> Article:
+    source_url = article.meta.get('source_url') or article.url
+    contents = {}
+    if source_url:
+        contents = item_metadata(source_url)
+
+    if 'error' not in contents:
+        for field, value in article_dict(contents).items():
+            update_article_field(article, field, value)
+    else:
+        logger.info('Error getting contents for %s: %s', article, contents.get('error'))
+
+    if 400 <= fetch(article.url).status_code < 500:
+        logger.info('Could not get url for %s', article)
+        article.status = 'Unreachable url'
+
+    article.date_checked = datetime.utcnow()
+
+    return article
+
+
+def check_articles(sources: List[str], batch_size=100):
+    logger.info('Checking %s articles for %s', batch_size, ', '.join(sources))
+    with make_session() as session:
+        for article in tqdm(
+            session.query(Article)
+            .filter(Article.date_checked < datetime.now() - timedelta(weeks=4))
+            .filter(Article.source.in_(sources))
+            .limit(batch_size)
+            .all()
+        ):
+            check_article(article)
+            session.add(article)
+        logger.debug('committing')
+        try:
+            session.commit()
+        except IntegrityError as e:
+            logger.error(e)
diff --git a/main.py b/main.py
index 82c30f07..e12f32b7 100644
--- a/main.py
+++ b/main.py
@@ -14,6 +14,7 @@
 )
 from align_data.embeddings.pinecone.update_pinecone import PineconeUpdater
 from align_data.embeddings.finetuning.training import finetune_embeddings
+from align_data.sources.validate import check_articles
 from align_data.settings import (
     METADATA_OUTPUT_SPREADSHEET,
     METADATA_SOURCE_SHEET,
@@ -151,6 +152,14 @@ def train_finetuning_layer(self) -> None:
         """
         finetune_embeddings()

+    def validate_articles(self, *names, n=100) -> None:
+        """Check n articles to see whether their data is correct and their urls point to valid addresses."""
+        if names == ("all",):
+            names = ALL_DATASETS
+        missing = {name for name in names if name not in ALL_DATASETS}
+        assert not missing, f"{missing} are not valid dataset names"
+        check_articles(names, n)
+

 if __name__ == "__main__":
     fire.Fire(AlignmentDataset)
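With `validate_articles` wired into the existing `fire.Fire(AlignmentDataset)` entry point, the checker can be triggered from the command line or called directly. A sketch, not part of the patch; the exact CLI flags follow python-fire's conventions:

```python
# Roughly equivalent ways to trigger a validation pass (illustrative sketch):
#
#   python main.py validate_articles all                       # up to 100 articles across all datasets
#   python main.py validate_articles lesswrong arxiv --n=50    # only these sources, smaller batch
#
# Or programmatically, bypassing the CLI wrapper:
from align_data.sources.validate import check_articles

check_articles(["lesswrong", "arxiv"], batch_size=50)
```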
diff --git a/migrations/versions/cfd1704ad799_date_checked_column.py b/migrations/versions/cfd1704ad799_date_checked_column.py
new file mode 100644
index 00000000..581c5c86
--- /dev/null
+++ b/migrations/versions/cfd1704ad799_date_checked_column.py
@@ -0,0 +1,28 @@
+"""date_checked column
+
+Revision ID: cfd1704ad799
+Revises: f5a2bcfa6b2c
+Create Date: 2023-09-03 18:57:35.390670
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import mysql
+
+
+# revision identifiers, used by Alembic.
+revision = 'cfd1704ad799'
+down_revision = 'f5a2bcfa6b2c'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column('articles', sa.Column('date_checked', sa.DateTime(), nullable=True))
+    # Set a random day in the past for the last check, so that the existing articles get checked randomly
+    op.execute('UPDATE articles SET date_checked = DATE_SUB(NOW(), INTERVAL FLOOR(RAND() * 101) DAY)')
+    op.alter_column('articles', 'date_checked', existing_type=mysql.DATETIME(), nullable=False)
+
+
+def downgrade() -> None:
+    op.drop_column('articles', 'date_checked')
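The backfill in `upgrade()` deliberately scatters `date_checked` over the last ~100 days, so the 4-hourly workflow re-checks existing articles gradually instead of all at once. A rough Python equivalent of the SQL expression, for illustration only (not part of the migration):

```python
# Sketch of what DATE_SUB(NOW(), INTERVAL FLOOR(RAND() * 101) DAY) does per existing row.
import random
from datetime import datetime, timedelta

def random_backfill_date(now: datetime | None = None) -> datetime:
    """Pick a uniformly random day between 0 and 100 days ago."""
    now = now or datetime.utcnow()
    return now - timedelta(days=random.randrange(101))
```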
diff --git a/tests/align_data/sources/test_validate.py b/tests/align_data/sources/test_validate.py
new file mode 100644
index 00000000..940aae09
--- /dev/null
+++ b/tests/align_data/sources/test_validate.py
@@ -0,0 +1,130 @@
+import pytest
+from unittest.mock import patch, Mock
+
+from align_data.db.models import Article
+from align_data.sources.validate import update_article_field, check_article, check_articles
+
+
+@pytest.mark.parametrize('url', (
+    None, '',
+    'http://blablabla.g',  # this isn't longer
+    'http://example.org',
+    'https://example.org',
+    'https://www.example.org',
+))
+def test_update_article_url_no_replace(url):
+    article = Article(url='https://example.org', title='Example')
+    update_article_field(article, 'url', url)
+    assert article.url == 'https://example.org'
+
+
+@pytest.mark.parametrize('url', (
+    'http://bla.bla.bla.org',
+    'https://example.org/ble/ble',
+))
+def test_update_article_url_replace(url):
+    article = Article(url='https://example.com', title='Example')
+    update_article_field(article, 'url', url)
+    assert article.url == url
+
+
+@pytest.mark.parametrize('title', (
+    None, '',
+    'ExAmPlE',
+    'Example',
+    ' Example ',
+    ' Example\n\n ',
+
+    '1234567',  # This different title is shorter, so won't be changed
+))
+def test_update_article_title_no_replace(title):
+    article = Article(url='https://example.com', title='Example')
+    update_article_field(article, 'title', title)
+    assert article.title == 'Example'
+
+
+@pytest.mark.parametrize('title', (
+    'ExAmPlEs',
+    'Some other title',
+))
+def test_update_article_title_replace(title):
+    article = Article(url='https://example.com', title='Example')
+    update_article_field(article, 'title', title)
+    assert article.title == title
+
+
+
+def test_update_article_field_with_meta():
+    article = Article(url='https://example.com', title='Example', meta={'bla': 'ble'})
+    update_article_field(article, 'meta', {'bla': 'asd', 'ble': 123})
+    assert article.meta == {'bla': 'ble', 'ble': 123}
+
+
+def test_update_article_field_with_new_field():
+    article = Article(url='https://example.com', title='Example')
+    update_article_field(article, 'description', 'This is an example article.')
+    assert article.description == 'This is an example article.'
+
+
+def test_update_article_field_with_new_field_and_empty_value():
+    article = Article(url='https://example.com', title='Example')
+    update_article_field(article, 'description', None)
+    assert hasattr(article, 'description') == False
+
+
+def test_check_article_gets_metadata():
+    data = {
+        'text': 'bla bla',
+        'source_url': 'http://ble.com',
+        'url': 'http://pretty.url',
+        'authors': ['mr. blobby', 'johhny'],
+    }
+    article = Article(url='http://this.should.change', meta={})
+    with patch('align_data.sources.validate.item_metadata', return_value=data):
+        with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=200)):
+            check_article(article)
+    assert article.to_dict() == {
+        'authors': ['mr. blobby', 'johhny'],
+        'date_published': None,
+        'id': None,
+        'source': None,
+        'source_type': None,
+        'source_url': 'http://ble.com',
+        'summaries': [],
+        'text': 'bla bla',
+        'title': None,
+        'url': 'http://this.should.change',
+    }
+
+
+def test_check_article_no_metadata():
+    data = {
+        'error': 'this failed!',
+        'text': 'bla bla',
+        'source_url': 'http://ble.com',
+        'url': 'http://pretty.url',
+        'authors': ['mr. blobby', 'johhny'],
+    }
+    article = Article(url='http://this.should.not.change', meta={})
+    with patch('align_data.sources.validate.item_metadata', return_value=data):
+        with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=200)):
+            check_article(article)
+    assert article.to_dict() == {
+        'authors': [],
+        'date_published': None,
+        'id': None,
+        'source': None,
+        'source_type': None,
+        'summaries': [],
+        'text': None,
+        'title': None,
+        'url': 'http://this.should.not.change',
+    }
+
+
+def test_check_article_bad_url():
+    article = Article(url='http://this.should.not.change', meta={})
+    with patch('align_data.sources.validate.item_metadata', return_value={}):
+        with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=400)):
+            check_article(article)
+    assert article.status == 'Unreachable url'
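Taken together, the tests above pin down the merge heuristics in `update_article_field`; a condensed, runnable recap of the same cases (illustrative only, not part of the patch):

```python
# Recap of the behaviour exercised by the tests above.
from align_data.db.models import Article
from align_data.sources.validate import update_article_field

article = Article(url='https://example.org', title='Example', meta={'views': 10})

# Equivalent URLs (after normalization) are ignored rather than overwritten.
update_article_field(article, 'url', 'http://www.example.org/')
assert article.url == 'https://example.org'

# meta is merged key by key, keeping whichever value compares greater.
update_article_field(article, 'meta', {'views': 3, 'likes': 7})
assert article.meta == {'views': 10, 'likes': 7}

# Other fields are only overwritten when empty, or when the new value is
# a later datetime / a longer string than what is already stored.
update_article_field(article, 'text', 'a longer body of text')
assert article.text == 'a longer body of text'
```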