From 44c3e171d53c15c2caec53f2fa782e0ae6ecd593 Mon Sep 17 00:00:00 2001
From: Daniel O'Connell
Date: Mon, 4 Sep 2023 15:45:13 +0200
Subject: [PATCH] separate formatters file

---
 align_data/common/alignment_dataset.py    | 62 +------------------
 align_data/common/formatters.py           | 60 ++++++++++++++++++
 align_data/sources/articles/datasets.py   |  3 +-
 align_data/sources/validate.py            | 16 +++--
 .../cfd1704ad799_date_checked_column.py    |  2 +-
 tests/align_data/sources/test_validate.py | 14 ++---
 6 files changed, 83 insertions(+), 74 deletions(-)
 create mode 100644 align_data/common/formatters.py

diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py
index 86a6ed13..06da73d7 100644
--- a/align_data/common/alignment_dataset.py
+++ b/align_data/common/alignment_dataset.py
@@ -1,11 +1,10 @@
-import re
 from datetime import datetime
 from itertools import islice
 import logging
 import time
 from dataclasses import dataclass, field, KW_ONLY
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Generator
+from typing import Iterable, List, Optional, Set, Tuple, Generator
 from sqlalchemy import select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import joinedload
@@ -20,68 +19,11 @@
 from align_data.db.models import Article, Summary
 from align_data.db.session import make_session
-from align_data.settings import ARTICLE_MAIN_KEYS
-from align_data.sources.utils import merge_dicts
+from align_data.common.formatters import normalize_url, article_dict
 
 
 logger = logging.getLogger(__name__)
 
 
-
-def normalize_url(url: str | None) -> str | None:
-    if not url:
-        return url
-
-    # ending '/'
-    url = url.rstrip("/")
-
-    # Remove http and use https consistently
-    url = url.replace("http://", "https://")
-
-    # Remove www
-    url = url.replace("https://www.", "https://")
-
-    # Remove index.html or index.htm
-    url = re.sub(r'/index\.html?$', '', url)
-
-    # Convert youtu.be links to youtube.com
-    url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")
-
-    # Additional rules for mirror domains can be added here
-
-    # agisafetyfundamentals.com -> aisafetyfundamentals.com
-    url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")
-
-    return url
-
-
-def normalize_text(text: str | None) -> str | None:
-    return (text or '').replace('\n', ' ').replace('\r', '').strip() or None
-
-
-def format_authors(authors: List[str]) -> str:
-    # TODO: Don't keep adding the same authors - come up with some way to reuse them
-    authors_str = ",".join(authors)
-    if len(authors_str) > 1024:
-        authors_str = ",".join(authors_str[:1024].split(",")[:-1])
-    return authors_str
-
-
-def article_dict(data, **kwargs) -> Dict[str, Any]:
-    data = merge_dicts(data, kwargs)
-
-    summaries = data.pop("summaries", [])
-    summary = data.pop("summary", None)
-
-    data['summaries'] = summaries + [summary] if summary else []
-    data['authors'] = format_authors(data.pop("authors", []))
-    data['title'] = normalize_text(data.get('title'))
-
-    return dict(
-        meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
-        **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
-    )
-
-
 @dataclass
 class AlignmentDataset:
     """The base dataset class."""
diff --git a/align_data/common/formatters.py b/align_data/common/formatters.py
new file mode 100644
index 00000000..ce2eb691
--- /dev/null
+++ b/align_data/common/formatters.py
@@ -0,0 +1,60 @@
+import re
+from typing import Any, Dict, List
+
+from align_data.settings import ARTICLE_MAIN_KEYS
+from align_data.sources.utils import merge_dicts
+
+
+def normalize_url(url: str | None) -> str | None:
+    if not url:
+        return url
+
+    # ending '/'
+    url = url.rstrip("/")
+
+    # Remove http and use https consistently
+    url = url.replace("http://", "https://")
+
+    # Remove www
+    url = url.replace("https://www.", "https://")
+
+    # Remove index.html or index.htm
+    url = re.sub(r'/index\.html?$', '', url)
+
+    # Convert youtu.be links to youtube.com
+    url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")
+
+    # Additional rules for mirror domains can be added here
+
+    # agisafetyfundamentals.com -> aisafetyfundamentals.com
+    url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")
+
+    return url
+
+
+def normalize_text(text: str | None) -> str | None:
+    return (text or '').replace('\n', ' ').replace('\r', '').strip() or None
+
+
+def format_authors(authors: List[str]) -> str:
+    # TODO: Don't keep adding the same authors - come up with some way to reuse them
+    authors_str = ",".join(authors)
+    if len(authors_str) > 1024:
+        authors_str = ",".join(authors_str[:1024].split(",")[:-1])
+    return authors_str
+
+
+def article_dict(data, **kwargs) -> Dict[str, Any]:
+    data = merge_dicts(data, kwargs)
+
+    summaries = data.pop("summaries", [])
+    summary = data.pop("summary", None)
+
+    data['summaries'] = summaries + [summary] if summary else []
+    data['authors'] = format_authors(data.pop("authors", []))
+    data['title'] = normalize_text(data.get('title'))
+
+    return dict(
+        meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
+        **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
+    )
diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py
index e8e0d317..69b2a358 100644
--- a/align_data/sources/articles/datasets.py
+++ b/align_data/sources/articles/datasets.py
@@ -11,7 +11,8 @@
 from pypandoc import convert_file
 from sqlalchemy import select, Select
 
-from align_data.common.alignment_dataset import AlignmentDataset, normalize_url
+from align_data.common.alignment_dataset import AlignmentDataset
+from align_data.common.formatters import normalize_url
 from align_data.db.models import Article
 from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown
 from align_data.sources.articles.parsers import (
diff --git a/align_data/sources/validate.py b/align_data/sources/validate.py
index a97b8b20..6f438f99 100644
--- a/align_data/sources/validate.py
+++ b/align_data/sources/validate.py
@@ -4,9 +4,9 @@
 from tqdm import tqdm
 from sqlalchemy.exc import IntegrityError
 
+from align_data.common.formatters import normalize_url, normalize_text, article_dict
 from align_data.db.session import make_session
 from align_data.db.models import Article
-from align_data.common.alignment_dataset import normalize_url, normalize_text, article_dict
 from align_data.sources.articles.parsers import item_metadata
 from align_data.sources.articles.html import fetch
 
@@ -29,8 +29,12 @@ def update_article_field(article: Article, field: str, value: Any):
         article.meta = article.meta or {}
         for k, v in value.items():
             meta_val = article.meta.get(k)
-            if not meta_val or v > meta_val:
-                article.meta[k] = v
+            try:
+                if not meta_val or v > meta_val:
+                    article.meta[k] = v
+            except Exception as e:
+                # Ignore exceptions here - the metadata isn't that important
+                logger.info('Error checking metadata value for article %s: %s', article.url, value)
         return
 
     article_val = getattr(article, field, None)
@@ -44,7 +48,8 @@ def update_article_field(article: Article, field: str, value: Any):
     setattr(article, field, normalize_text(value))
 
 
-def check_article(article: Article) -> Article:
+def update_article(article: Article) -> Article:
+    """Check whether there are better data for this article and whether its url is pointing somewhere decent."""
     source_url = article.meta.get('source_url') or article.url
     contents = {}
     if source_url:
@@ -66,6 +71,7 @@
 
 
 def check_articles(sources: List[str], batch_size=100):
+    """Check `batch_size` articles with the given `sources` to see if they have better data."""
     logger.info('Checking %s articles for %s', batch_size, ', '.join(sources))
     with make_session() as session:
         for article in tqdm(
@@ -75,7 +81,7 @@ def check_articles(sources: List[str], batch_size=100):
             .limit(batch_size)
             .all()
         ):
-            check_article(article)
+            update_article(article)
             session.add(article)
         logger.debug('commiting')
         try:
diff --git a/migrations/versions/cfd1704ad799_date_checked_column.py b/migrations/versions/cfd1704ad799_date_checked_column.py
index 581c5c86..deb0d3e6 100644
--- a/migrations/versions/cfd1704ad799_date_checked_column.py
+++ b/migrations/versions/cfd1704ad799_date_checked_column.py
@@ -12,7 +12,7 @@
 
 # revision identifiers, used by Alembic.
 revision = 'cfd1704ad799'
-down_revision = 'f5a2bcfa6b2c'
+down_revision = '1866340e456a'
 branch_labels = None
 depends_on = None
 
diff --git a/tests/align_data/sources/test_validate.py b/tests/align_data/sources/test_validate.py
index 940aae09..3eebc918 100644
--- a/tests/align_data/sources/test_validate.py
+++ b/tests/align_data/sources/test_validate.py
@@ -2,7 +2,7 @@
 from unittest.mock import patch, Mock
 
 from align_data.db.models import Article
-from align_data.sources.validate import update_article_field, check_article, check_articles
+from align_data.sources.validate import update_article_field, update_article
 
 
 @pytest.mark.parametrize('url', (
@@ -72,7 +72,7 @@ def test_update_article_field_with_new_field_and_empty_value():
     assert hasattr(article, 'description') == False
 
 
-def test_check_article_gets_metadata():
+def test_update_article_gets_metadata():
     data = {
         'text': 'bla bla',
         'source_url': 'http://ble.com',
@@ -82,7 +82,7 @@
     article = Article(url='http://this.should.change', meta={})
     with patch('align_data.sources.validate.item_metadata', return_value=data):
         with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=200)):
-            check_article(article)
+            update_article(article)
 
     assert article.to_dict() == {
         'authors': ['mr. blobby', 'johhny'],
         'date_published': None,
         'id': None,
         'meta': {'authors': ['mr. blobby', 'johhny'], 'source_url': 'http://ble.com'},
         'source': None,
         'source_type': None,
         'summaries': [],
         'text': 'bla bla',
         'title': None,
         'url': 'https://ble.com',
     }
 
 
-def test_check_article_no_metadata():
+def test_update_article_no_metadata():
     data = {
         'error': 'this failed!',
         'text': 'bla bla',
@@ -108,7 +108,7 @@
     article = Article(url='http://this.should.not.change', meta={})
     with patch('align_data.sources.validate.item_metadata', return_value=data):
         with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=200)):
-            check_article(article)
+            update_article(article)
 
     assert article.to_dict() == {
         'authors': [],
         'date_published': None,
         'id': None,
         'meta': {'error': 'this failed!', 'source_url': 'http://ble.com'},
         'source': None,
         'source_type': None,
         'summaries': [],
         'text': 'bla bla',
         'title': None,
         'url': 'https://this.should.not.change',
     }
 
 
-def test_check_article_bad_url():
+def test_update_article_bad_url():
     article = Article(url='http://this.should.not.change', meta={})
     with patch('align_data.sources.validate.item_metadata', return_value={}):
         with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=400)):
-            check_article(article)
+            update_article(article)
 
     assert article.status == 'Unreachable url'
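
Below the diff, a minimal usage sketch of the helpers relocated into align_data/common/formatters.py. This is illustrative only and not part of the patch; the module path and function behaviour come from the diff above, while the values ("Alice", "Bob", "custom_field", example.com) are hypothetical, and it assumes 'title', 'url' and 'authors' are listed in ARTICLE_MAIN_KEYS in align_data.settings.

    from align_data.common.formatters import article_dict, normalize_url

    # normalize_url() canonicalises link variants so duplicate articles compare equal:
    assert normalize_url("http://www.example.com/index.html") == "https://example.com"
    assert normalize_url("https://youtu.be/abc123") == "https://youtube.com/watch?v=abc123"

    # article_dict() merges keyword overrides into the row, folds a bare `summary`
    # into the `summaries` list, joins authors into one string, normalizes the title,
    # and moves keys that are not main Article columns into `meta`:
    row = article_dict(
        {"title": "Some title\n", "url": "https://example.com", "custom_field": 42},
        summary="A short summary",
        authors=["Alice", "Bob"],
    )
    # row["authors"] == "Alice,Bob", row["title"] == "Some title", and
    # "custom_field" ends up under row["meta"] (assuming it is not in ARTICLE_MAIN_KEYS).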