Article checker
mruwnik committed Sep 3, 2023
1 parent 0bcf0d9 commit b856d07
Showing 8 changed files with 384 additions and 56 deletions.
65 changes: 65 additions & 0 deletions .github/workflows/check-articles.yml
@@ -0,0 +1,65 @@
name: Check articles are valid

on:
  workflow_call:
    inputs:
      datasource:
        type: string
        required: true
  workflow_dispatch: # allow manual triggering
    inputs:
      datasource:
        description: 'The datasource to process'
        type: choice
        options:
          - all
          - agentmodels
          - agisf
          - aisafety.info
          - alignment_newsletter
          - alignmentforum
          - arbital
          - arxiv
          - blogs
          - distill
          - eaforum
          - indices
          - lesswrong
          - special_docs
          - youtube
  schedule:
    - cron: "0 */4 * * *" # Every 4 hours

jobs:
  build-dataset:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Setup Python environment
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Install Pandoc
        run: |
          if [ "${{ inputs.datasource }}" = "gdocs" ]; then
            sudo apt-get update
            sudo apt-get -y install pandoc
          fi
      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Process dataset
        env:
          CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
          AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }}
          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
          ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
          ARD_DB_NAME: alignment_research_dataset
        run: python main.py fetch ${{ inputs.datasource }}
113 changes: 62 additions & 51 deletions align_data/common/alignment_dataset.py
@@ -5,7 +5,7 @@
import time
from dataclasses import dataclass, KW_ONLY
from pathlib import Path
from typing import Iterable, List, Optional, Set
from typing import Any, Dict, Iterable, List, Optional, Set
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload
@@ -23,6 +23,62 @@
logger = logging.getLogger(__name__)



def normalize_url(url: str | None) -> str | None:
    if not url:
        return url

    # ending '/'
    url = url.rstrip("/")

    # Remove http and use https consistently
    url = url.replace("http://", "https://")

    # Remove www
    url = url.replace("https://www.", "https://")

    # Remove index.html or index.htm
    url = re.sub(r'/index\.html?$', '', url)

    # Convert youtu.be links to youtube.com
    url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")

    # Additional rules for mirror domains can be added here

    # agisafetyfundamentals.com -> aisafetyfundamentals.com
    url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")

    return url
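
A quick, illustrative sketch of how these rules compose (the URLs below are made up, not taken from the dataset):

assert normalize_url("http://www.example.com/blog/index.html") == "https://example.com/blog"
assert normalize_url("https://youtu.be/dQw4w9WgXcQ") == "https://youtube.com/watch?v=dQw4w9WgXcQ"
assert normalize_url("http://agisafetyfundamentals.com/curriculum/") == "https://aisafetyfundamentals.com/curriculum"

Since these are plain substring replacements rather than a full URL parse, the order matters: the http -> https rewrite has to run before the www, youtu.be and mirror-domain rules, which all match on the https:// prefix.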


def normalize_text(text: str | None) -> str | None:
    return (text or '').replace('\n', ' ').replace('\r', '').strip() or None


def format_authors(authors: List[str]) -> str:
    # TODO: Don't keep adding the same authors - come up with some way to reuse them
    authors_str = ",".join(authors)
    if len(authors_str) > 1024:
        authors_str = ",".join(authors_str[:1024].split(",")[:-1])
    return authors_str
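
A short sanity check on the truncation behaviour (author names invented for the example):

assert format_authors(["Jane Doe", "John Smith"]) == "Jane Doe,John Smith"
# With a very long list, the joined string is cut back to the last complete name that fits,
# so the stored value stays within 1024 characters and never ends mid-name:
assert len(format_authors([f"Author {i}" for i in range(200)])) <= 1024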


def article_dict(data, **kwargs) -> Dict[str, Any]:
    data = merge_dicts(data, kwargs)

    summaries = data.pop("summaries", [])
    summary = data.pop("summary", None)

    data['summaries'] = summaries + ([summary] if summary else [])
    data['authors'] = format_authors(data.pop("authors", []))
    data['title'] = normalize_text(data.get('title'))

    return dict(
        meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
        **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
    )
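
Roughly, article_dict folds 'summary' into 'summaries', joins the author list and normalises the title, then splits the result into Article columns plus a catch-all meta dict. A hedged example; which keys count as columns is decided by ARTICLE_MAIN_KEYS, which is defined elsewhere in the repo and not shown in this diff:

cooked = article_dict({
    "title": "Some\ntitle",
    "url": "https://example.com/post",
    "authors": ["Jane Doe", "John Smith"],
    "summary": "A one-line summary",
    "venue": "Example Conf",  # assuming 'venue' is not a main key, it should land in cooked['meta']
})
# After normalisation: the title's newline is collapsed ("Some title"), the authors are joined
# ("Jane Doe,John Smith"), and the lone summary becomes ["A one-line summary"]; each value then
# ends up either as a top-level key or under cooked['meta'], depending on ARTICLE_MAIN_KEYS.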


@dataclass
class AlignmentDataset:
"""The base dataset class."""
@@ -62,28 +118,10 @@ def __post_init__(self, data_path=Path(__file__).parent / "../../data/"):
        # set the default place to look for data
        self.files_path = self.raw_data_path / self.name

    def _add_authors(self, article: Article, authors: List[str]) -> Article:
        # TODO: Don't keep adding the same authors - come up with some way to reuse them
        article.authors = ",".join(authors)
        if len(article.authors) > 1024:
            article.authors = ",".join(article.authors[:1024].split(",")[:-1])
        return article

    def make_data_entry(self, data, **kwargs) -> Article:
        data = merge_dicts(data, kwargs)

        summaries = data.pop("summaries", [])
        summary = data.pop("summary", None)
        summaries += [summary] if summary else []

        authors = data.pop("authors", [])
        data['title'] = (data.get('title') or '').replace('\n', ' ').replace('\r', '') or None

        article = Article(
            meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
            **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
        )
        self._add_authors(article, authors)
        data = article_dict(data, **kwargs)
        summaries = data.pop('summaries', [])
        article = Article(**data)
        for summary in summaries:  # Note: This will be skipped if summaries is empty
            article.summaries.append(Summary(text=summary, source=self.name))
        return article
@@ -152,35 +190,8 @@ def get_item_key(self, item) -> str:
"""
return item.name

@staticmethod
def _normalize_url(url: str | None) -> str | None:
if not url:
return url

# ending '/'
url = url.rstrip("/")

# Remove http and use https consistently
url = url.replace("http://", "https://")

# Remove www
url = url.replace("https://www.", "https://")

# Remove index.html or index.htm
url = re.sub(r'/index\.html?$', '', url)

# Convert youtu.be links to youtube.com
url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")

# Additional rules for mirror domains can be added here

# agisafetyfundamentals.com -> aisafetyfundamentals.com
url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")

return url

def _normalize_urls(self, urls: Iterable[str]) -> Set[str]:
return {self._normalize_url(url) for url in urls}
return {normalize_url(url) for url in urls}


def _load_outputted_items(self) -> Set[str]:
@@ -202,7 +213,7 @@ def not_processed(self, item) -> bool:
        # cause problems (e.g. massive RAM usage, big slowdowns) then it will have to be switched around, so that
        # this function runs a query to check if the item is in the database rather than first getting all done_keys.
        # If it gets to that level, consider batching it somehow
        return self._normalize_url(self.get_item_key(item)) not in self._outputted_items
        return normalize_url(self.get_item_key(item)) not in self._outputted_items

    def unprocessed_items(self, items=None) -> Iterable:
        """Return a list of all items to be processed.
1 change: 1 addition & 0 deletions align_data/db/models.py
@@ -63,6 +63,7 @@ class Article(Base):
    date_updated: Mapped[Optional[datetime]] = mapped_column(
        DateTime, onupdate=func.current_timestamp()
    )
    date_checked: Mapped[datetime] = mapped_column(DateTime, default=func.now())  # When this article was last checked to see if it's still valid
    status: Mapped[Optional[str]] = mapped_column(String(256))
    comments: Mapped[Optional[str]] = mapped_column(LONGTEXT)  # Editor comments. Can be anything

10 changes: 5 additions & 5 deletions align_data/sources/articles/datasets.py
@@ -10,7 +10,7 @@
from pypandoc import convert_file
from sqlalchemy import select

from align_data.common.alignment_dataset import AlignmentDataset
from align_data.common.alignment_dataset import AlignmentDataset, normalize_url
from align_data.db.models import Article
from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown
from align_data.sources.articles.parsers import (
@@ -122,16 +122,16 @@ def not_processed(self, item: tuple) -> bool:
        url = self.maybe(item, "url")
        source_url = self.maybe(item, "source_url")

        if item_key and self._normalize_url(item_key) in self._outputted_items:
        if item_key and normalize_url(item_key) in self._outputted_items:
            return False

        for given_url in [url, source_url]:
            if given_url:
                norm_url = self._normalize_url(given_url)
                norm_url = normalize_url(given_url)
                if norm_url in self._outputted_items:
                    return False

                norm_canonical_url = self._normalize_url(arxiv_canonical_url(given_url))
                norm_canonical_url = normalize_url(arxiv_canonical_url(given_url))
                if norm_canonical_url in self._outputted_items:
                    return False

84 changes: 84 additions & 0 deletions align_data/sources/validate.py
@@ -0,0 +1,84 @@
import logging
from datetime import datetime, timedelta
from typing import Any, List

from tqdm import tqdm
from sqlalchemy.exc import IntegrityError
from align_data.db.session import make_session
from align_data.db.models import Article
from align_data.common.alignment_dataset import normalize_url, normalize_text, article_dict
from align_data.sources.articles.parsers import item_metadata
from align_data.sources.articles.html import fetch


logger = logging.getLogger(__name__)


def update_article_field(article: Article, field: str, value: Any):
    if not value:
        return

    if field == 'url' and normalize_url(article.url) == normalize_url(value):
        # This is pretty much the same url, so don't modify it
        return
    if field == 'title' and normalize_text(article.title) == normalize_text(value):
        # If there are slight differences in the titles (e.g. punctuation), assume the
        # database version is more correct
        return
    if field == 'meta':
        article.meta = article.meta or {}
        for k, v in value.items():
            meta_val = article.meta.get(k)
            if not meta_val or v > meta_val:
                article.meta[k] = v
        return

    article_val = getattr(article, field, None)
    # Assume that if the provided value is larger (or later, in the case of dates), then it's
    # better. This might very well not hold, but it seems like a decent heuristic?
    if not article_val:
        setattr(article, field, value)
    elif isinstance(value, datetime) and value > article_val:
        setattr(article, field, value)
    elif isinstance(value, str) and len(normalize_text(value) or '') > len(normalize_text(article_val) or ''):
        setattr(article, field, normalize_text(value))
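
To make the merge heuristic concrete, a small sketch (the field values are invented for illustration):

from align_data.db.models import Article

article = Article(title="Short title", url="https://example.com/a")
update_article_field(article, "title", "A considerably longer and more descriptive title")
# The longer string wins, so article.title is replaced.
update_article_field(article, "url", "http://www.example.com/a/")
# Normalises to the same url as the stored one, so the existing value is kept.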


def check_article(article: Article) -> Article:
    source_url = article.meta.get('source_url') or article.url
    contents = {}
    if source_url:
        contents = item_metadata(source_url)

    if 'error' not in contents:
        for field, value in article_dict(contents).items():
            update_article_field(article, field, value)
    else:
        logger.info('Error getting contents for %s: %s', article, contents.get('error'))

    if 400 <= fetch(article.url).status_code < 500:
        logger.info('Could not get url for %s', article)
        article.status = 'Unreachable url'

    article.date_checked = datetime.utcnow()

    return article


def check_articles(sources: List[str], batch_size=100):
    logger.info('Checking %s articles for %s', batch_size, ', '.join(sources))
    with make_session() as session:
        for article in tqdm(
            session.query(Article)
            .filter(Article.date_checked < datetime.now() - timedelta(weeks=4))
            .filter(Article.source.in_(sources))
            .limit(batch_size)
            .all()
        ):
            check_article(article)
            session.add(article)
        logger.debug('committing')
        try:
            session.commit()
        except IntegrityError as e:
            logger.error(e)
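
As a usage sketch, the checker can be driven directly (the sources and batch size here are arbitrary examples):

from align_data.sources.validate import check_articles

# Re-check up to 50 arxiv/lesswrong articles whose last check is more than 4 weeks old
check_articles(["arxiv", "lesswrong"], batch_size=50)

In practice this is wrapped by the validate_articles command added to main.py below.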
9 changes: 9 additions & 0 deletions main.py
@@ -14,6 +14,7 @@
)
from align_data.embeddings.pinecone.update_pinecone import PineconeUpdater
from align_data.embeddings.finetuning.training import finetune_embeddings
from align_data.sources.validate import check_articles
from align_data.settings import (
    METADATA_OUTPUT_SPREADSHEET,
    METADATA_SOURCE_SHEET,
@@ -151,6 +152,14 @@ def train_finetuning_layer(self) -> None:
"""
finetune_embeddings()

def validate_articles(self, *names, n=100) -> None:
"""Check n articles to see whether their data is correct and that their urls point to valid addresses."""
if names == ("all",):
names = ALL_DATASETS
missing = {name for name in names if name not in ALL_DATASETS}
assert not missing, f"{missing} are not valid dataset names"
check_articles(names, n)


if __name__ == "__main__":
    fire.Fire(AlignmentDataset)
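
Because main.py hands this class to fire, the new check should also be reachable as a CLI subcommand, e.g. python main.py validate_articles arxiv --n=50 (or all to cover every dataset). A minimal programmatic equivalent, assuming main.py is importable from the repo root:

from main import AlignmentDataset  # the fire entry-point class, not the dataset base class

AlignmentDataset().validate_articles("arxiv", n=50)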
28 changes: 28 additions & 0 deletions migrations/versions/cfd1704ad799_date_checked_column.py
@@ -0,0 +1,28 @@
"""date_checked column
Revision ID: cfd1704ad799
Revises: f5a2bcfa6b2c
Create Date: 2023-09-03 18:57:35.390670
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import mysql


# revision identifiers, used by Alembic.
revision = 'cfd1704ad799'
down_revision = 'f5a2bcfa6b2c'
branch_labels = None
depends_on = None


def upgrade() -> None:
    op.add_column('articles', sa.Column('date_checked', sa.DateTime(), nullable=True))
    # Set a random day in the past for the last check, so that the existing articles get checked randomly
    op.execute('UPDATE articles SET date_checked = DATE_SUB(NOW(), INTERVAL FLOOR(RAND() * 101) DAY)')
    op.alter_column('articles', 'date_checked', existing_type=mysql.DATETIME(), nullable=False)


def downgrade() -> None:
    op.drop_column('articles', 'date_checked')