seperate formatters file

StampyAI · Sep 4, 2023 · 130056a · 130056a
1 parent 5b692af
commit 130056a
Show file tree

Hide file tree

Showing 6 changed files with 83 additions and 74 deletions.
diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py
@@ -1,11 +1,10 @@
-import re
 from datetime import datetime
 from itertools import islice
 import logging
 import time
 from dataclasses import dataclass, KW_ONLY
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Set
+from typing import Iterable, List, Optional, Set
 from sqlalchemy import select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import joinedload
@@ -16,69 +15,12 @@
 from tqdm import tqdm
 from align_data.db.models import Article, Summary
 from align_data.db.session import make_session
-from align_data.settings import ARTICLE_MAIN_KEYS
-from align_data.sources.utils import merge_dicts
+from align_data.common.formatters import normalize_url, article_dict
 
 
 logger = logging.getLogger(__name__)
 
 
-
-def normalize_url(url: str | None) -> str | None:
-    if not url:
-        return url
-
-    # ending '/'
-    url = url.rstrip("/")
-
-    # Remove http and use https consistently
-    url = url.replace("http://", "https://")
-
-    # Remove www
-    url = url.replace("https://www.", "https://")
-
-    # Remove index.html or index.htm
-    url = re.sub(r'/index\.html?$', '', url)
-
-    # Convert youtu.be links to youtube.com
-    url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")
-
-    # Additional rules for mirror domains can be added here
-
-    # agisafetyfundamentals.com -> aisafetyfundamentals.com
-    url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")
-
-    return url
-
-
-def normalize_text(text: str | None) -> str | None:
-    return (text or '').replace('\n', ' ').replace('\r', '').strip() or None
-
-
-def format_authors(authors: List[str]) -> str:
-    # TODO: Don't keep adding the same authors - come up with some way to reuse them
-    authors_str = ",".join(authors)
-    if len(authors_str) > 1024:
-        authors_str = ",".join(authors_str[:1024].split(",")[:-1])
-    return authors_str
-
-
-def article_dict(data, **kwargs) -> Dict[str, Any]:
-    data = merge_dicts(data, kwargs)
-
-    summaries = data.pop("summaries", [])
-    summary = data.pop("summary", None)
-
-    data['summaries'] = summaries + [summary] if summary else []
-    data['authors'] = format_authors(data.pop("authors", []))
-    data['title'] = normalize_text(data.get('title'))
-
-    return dict(
-        meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
-        **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
-    )
-
-
 @dataclass
 class AlignmentDataset:
     """The base dataset class."""

diff --git a/align_data/common/formatters.py b/align_data/common/formatters.py
@@ -0,0 +1,60 @@
+import re
+from typing import Any, Dict, List
+
+from align_data.settings import ARTICLE_MAIN_KEYS
+from align_data.sources.utils import merge_dicts
+
+
+def normalize_url(url: str | None) -> str | None:
+    if not url:
+        return url
+
+    # ending '/'
+    url = url.rstrip("/")
+
+    # Remove http and use https consistently
+    url = url.replace("http://", "https://")
+
+    # Remove www
+    url = url.replace("https://www.", "https://")
+
+    # Remove index.html or index.htm
+    url = re.sub(r'/index\.html?$', '', url)
+
+    # Convert youtu.be links to youtube.com
+    url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")
+
+    # Additional rules for mirror domains can be added here
+
+    # agisafetyfundamentals.com -> aisafetyfundamentals.com
+    url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")
+
+    return url
+
+
+def normalize_text(text: str | None) -> str | None:
+    return (text or '').replace('\n', ' ').replace('\r', '').strip() or None
+
+
+def format_authors(authors: List[str]) -> str:
+    # TODO: Don't keep adding the same authors - come up with some way to reuse them
+    authors_str = ",".join(authors)
+    if len(authors_str) > 1024:
+        authors_str = ",".join(authors_str[:1024].split(",")[:-1])
+    return authors_str
+
+
+def article_dict(data, **kwargs) -> Dict[str, Any]:
+    data = merge_dicts(data, kwargs)
+
+    summaries = data.pop("summaries", [])
+    summary = data.pop("summary", None)
+
+    data['summaries'] = summaries + [summary] if summary else []
+    data['authors'] = format_authors(data.pop("authors", []))
+    data['title'] = normalize_text(data.get('title'))
+
+    return dict(
+        meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
+        **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
+    )
diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py
@@ -10,7 +10,8 @@
 from pypandoc import convert_file
 from sqlalchemy import select
 
-from align_data.common.alignment_dataset import AlignmentDataset, normalize_url
+from align_data.common.alignment_dataset import AlignmentDataset
+from align_data.common.formatters import normalize_url
 from align_data.db.models import Article
 from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown
 from align_data.sources.articles.parsers import (

diff --git a/align_data/sources/validate.py b/align_data/sources/validate.py
@@ -4,9 +4,9 @@
 
 from tqdm import tqdm
 from sqlalchemy.exc import IntegrityError
+from align_data.common.formatters import normalize_url, normalize_text, article_dict
 from align_data.db.session import make_session
 from align_data.db.models import Article
-from align_data.common.alignment_dataset import normalize_url, normalize_text, article_dict
 from align_data.sources.articles.parsers import item_metadata
 from align_data.sources.articles.html import fetch
 
@@ -29,8 +29,12 @@ def update_article_field(article: Article, field: str, value: Any):
         article.meta = article.meta or {}
         for k, v in value.items():
             meta_val = article.meta.get(k)
-            if not meta_val or v > meta_val:
-                article.meta[k] = v
+            try:
+                if not meta_val or v > meta_val:
+                    article.meta[k] = v
+            except Exception as e:
+                # Ignore exceptions here - the metadata isn't that important
+                logger.info('Error checking metadata value: %s', value)
         return
 
     article_val = getattr(article, field, None)
@@ -44,7 +48,8 @@ def update_article_field(article: Article, field: str, value: Any):
         setattr(article, field, normalize_text(value))
 
 
-def check_article(article: Article) -> Article:
+def update_article(article: Article) -> Article:
+    """Check whether there are better data for this article and whether its url is pointing somewhere decent."""
     source_url = article.meta.get('source_url') or article.url
     contents = {}
     if source_url:
@@ -66,6 +71,7 @@ def check_article(article: Article) -> Article:
 
 
 def check_articles(sources: List[str], batch_size=100):
+    """Check `batch_size` articles with the given `sources` to see if they have better data."""
     logger.info('Checking %s articles for %s', batch_size, ', '.join(sources))
     with make_session() as session:
         for article in tqdm(
@@ -75,7 +81,7 @@ def check_articles(sources: List[str], batch_size=100):
             .limit(batch_size)
             .all()
         ):
-            check_article(article)
+            update_article(article)
             session.add(article)
         logger.debug('commiting')
         try:

diff --git a/migrations/versions/cfd1704ad799_date_checked_column.py b/migrations/versions/cfd1704ad799_date_checked_column.py
@@ -12,7 +12,7 @@
 
 # revision identifiers, used by Alembic.
 revision = 'cfd1704ad799'
-down_revision = 'f5a2bcfa6b2c'
+down_revision = '1866340e456a'
 branch_labels = None
 depends_on = None
 

diff --git a/tests/align_data/sources/test_validate.py b/tests/align_data/sources/test_validate.py
@@ -2,7 +2,7 @@
 from unittest.mock import patch, Mock
 
 from align_data.db.models import Article
-from align_data.sources.validate import update_article_field, check_article, check_articles
+from align_data.sources.validate import update_article_field, update_article
 
 
 @pytest.mark.parametrize('url', (
@@ -72,7 +72,7 @@ def test_update_article_field_with_new_field_and_empty_value():
     assert hasattr(article, 'description') == False
 
 
-def test_check_article_gets_metadata():
+def test_update_article_gets_metadata():
     data = {
         'text': 'bla bla',
         'source_url': 'http://ble.com',
@@ -82,7 +82,7 @@ def test_check_article_gets_metadata():
     article = Article(url='http://this.should.change', meta={})
     with patch('align_data.sources.validate.item_metadata', return_value=data):
         with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=200)):
-            check_article(article)
+            update_article(article)
             assert article.to_dict() == {
                 'authors': ['mr. blobby', 'johhny'],
                 'date_published': None,
@@ -97,7 +97,7 @@ def test_check_article_gets_metadata():
             }
 
 
-def test_check_article_no_metadata():
+def test_update_article_no_metadata():
     data = {
         'error': 'this failed!',
         'text': 'bla bla',
@@ -108,7 +108,7 @@ def test_check_article_no_metadata():
     article = Article(url='http://this.should.not.change', meta={})
     with patch('align_data.sources.validate.item_metadata', return_value=data):
         with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=200)):
-            check_article(article)
+            update_article(article)
             assert article.to_dict() == {
                 'authors': [],
                 'date_published': None,
@@ -122,9 +122,9 @@ def test_check_article_no_metadata():
             }
 
 
-def test_check_article_bad_url():
+def test_update_article_bad_url():
     article = Article(url='http://this.should.not.change', meta={})
     with patch('align_data.sources.validate.item_metadata', return_value={}):
         with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=400)):
-            check_article(article)
+            update_article(article)
             assert article.status == 'Unreachable url'