Skip to content

Commit

Permalink
separate formatters file
Browse files Browse the repository at this point in the history
  • Loading branch information
mruwnik committed Sep 4, 2023
1 parent 5b692af commit 130056a
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 74 deletions.
62 changes: 2 additions & 60 deletions align_data/common/alignment_dataset.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import re
from datetime import datetime
from itertools import islice
import logging
import time
from dataclasses import dataclass, KW_ONLY
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Set
from typing import Iterable, List, Optional, Set
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload
Expand All @@ -16,69 +15,12 @@
from tqdm import tqdm
from align_data.db.models import Article, Summary
from align_data.db.session import make_session
from align_data.settings import ARTICLE_MAIN_KEYS
from align_data.sources.utils import merge_dicts
from align_data.common.formatters import normalize_url, article_dict


logger = logging.getLogger(__name__)



def normalize_url(url: str | None) -> str | None:
if not url:
return url

# ending '/'
url = url.rstrip("/")

# Remove http and use https consistently
url = url.replace("http://", "https://")

# Remove www
url = url.replace("https://www.", "https://")

# Remove index.html or index.htm
url = re.sub(r'/index\.html?$', '', url)

# Convert youtu.be links to youtube.com
url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")

# Additional rules for mirror domains can be added here

# agisafetyfundamentals.com -> aisafetyfundamentals.com
url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")

return url


def normalize_text(text: str | None) -> str | None:
return (text or '').replace('\n', ' ').replace('\r', '').strip() or None


def format_authors(authors: List[str]) -> str:
    """Join ``authors`` into a comma-separated string capped at 1024 characters.

    When the joined string is longer than the cap, it is cut at the last
    complete author name fitting within the first 1024 characters (any
    partial trailing name is dropped).
    """
    # TODO: Don't keep adding the same authors - come up with some way to reuse them
    joined = ",".join(authors)
    if len(joined) <= 1024:
        return joined
    # Truncate at the cap, then drop the (possibly partial) final segment so
    # only whole names remain.
    whole_names = joined[:1024].split(",")[:-1]
    return ",".join(whole_names)


def article_dict(data, **kwargs) -> Dict[str, Any]:
    """Merge ``data`` with ``kwargs`` and shape the result for Article creation.

    Returns a dict whose top-level keys are the ARTICLE_MAIN_KEYS columns
    (passed through as-is), plus a ``meta`` dict collecting every other
    non-None value. A single ``summary`` entry is appended to the
    ``summaries`` list, authors are flattened to a capped comma-separated
    string, and the title is normalized.
    """
    data = merge_dicts(data, kwargs)

    summaries = data.pop("summaries", [])
    summary = data.pop("summary", None)

    # Bug fix: `summaries + [summary] if summary else []` parsed as
    # `(summaries + [summary]) if summary else []`, silently discarding any
    # pre-existing summaries whenever no `summary` was provided.
    data['summaries'] = summaries + ([summary] if summary else [])
    data['authors'] = format_authors(data.pop("authors", []))
    data['title'] = normalize_text(data.get('title'))

    return dict(
        meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
        **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
    )


@dataclass
class AlignmentDataset:
"""The base dataset class."""
Expand Down
60 changes: 60 additions & 0 deletions align_data/common/formatters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import re
from typing import Any, Dict, List

from align_data.settings import ARTICLE_MAIN_KEYS
from align_data.sources.utils import merge_dicts


def normalize_url(url: str | None) -> str | None:
if not url:
return url

# ending '/'
url = url.rstrip("/")

# Remove http and use https consistently
url = url.replace("http://", "https://")

# Remove www
url = url.replace("https://www.", "https://")

# Remove index.html or index.htm
url = re.sub(r'/index\.html?$', '', url)

# Convert youtu.be links to youtube.com
url = url.replace("https://youtu.be/", "https://youtube.com/watch?v=")

# Additional rules for mirror domains can be added here

# agisafetyfundamentals.com -> aisafetyfundamentals.com
url = url.replace("https://agisafetyfundamentals.com", "https://aisafetyfundamentals.com")

return url


def normalize_text(text: str | None) -> str | None:
return (text or '').replace('\n', ' ').replace('\r', '').strip() or None


def format_authors(authors: List[str]) -> str:
    """Join ``authors`` into a comma-separated string capped at 1024 characters.

    When the joined string is longer than the cap, it is cut at the last
    complete author name fitting within the first 1024 characters (any
    partial trailing name is dropped).
    """
    # TODO: Don't keep adding the same authors - come up with some way to reuse them
    joined = ",".join(authors)
    if len(joined) <= 1024:
        return joined
    # Truncate at the cap, then drop the (possibly partial) final segment so
    # only whole names remain.
    whole_names = joined[:1024].split(",")[:-1]
    return ",".join(whole_names)


def article_dict(data, **kwargs) -> Dict[str, Any]:
    """Merge ``data`` with ``kwargs`` and shape the result for Article creation.

    Returns a dict whose top-level keys are the ARTICLE_MAIN_KEYS columns
    (passed through as-is), plus a ``meta`` dict collecting every other
    non-None value. A single ``summary`` entry is appended to the
    ``summaries`` list, authors are flattened to a capped comma-separated
    string, and the title is normalized.
    """
    data = merge_dicts(data, kwargs)

    summaries = data.pop("summaries", [])
    summary = data.pop("summary", None)

    # Bug fix: `summaries + [summary] if summary else []` parsed as
    # `(summaries + [summary]) if summary else []`, silently discarding any
    # pre-existing summaries whenever no `summary` was provided.
    data['summaries'] = summaries + ([summary] if summary else [])
    data['authors'] = format_authors(data.pop("authors", []))
    data['title'] = normalize_text(data.get('title'))

    return dict(
        meta={k: v for k, v in data.items() if k not in ARTICLE_MAIN_KEYS and v is not None},
        **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
    )
3 changes: 2 additions & 1 deletion align_data/sources/articles/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from pypandoc import convert_file
from sqlalchemy import select

from align_data.common.alignment_dataset import AlignmentDataset, normalize_url
from align_data.common.alignment_dataset import AlignmentDataset
from align_data.common.formatters import normalize_url
from align_data.db.models import Article
from align_data.sources.articles.google_cloud import fetch_file, fetch_markdown
from align_data.sources.articles.parsers import (
Expand Down
16 changes: 11 additions & 5 deletions align_data/sources/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

from tqdm import tqdm
from sqlalchemy.exc import IntegrityError
from align_data.common.formatters import normalize_url, normalize_text, article_dict
from align_data.db.session import make_session
from align_data.db.models import Article
from align_data.common.alignment_dataset import normalize_url, normalize_text, article_dict
from align_data.sources.articles.parsers import item_metadata
from align_data.sources.articles.html import fetch

Expand All @@ -29,8 +29,12 @@ def update_article_field(article: Article, field: str, value: Any):
article.meta = article.meta or {}
for k, v in value.items():
meta_val = article.meta.get(k)
if not meta_val or v > meta_val:
article.meta[k] = v
try:
if not meta_val or v > meta_val:
article.meta[k] = v
except Exception as e:
# Ignore exceptions here - the metadata isn't that important
logger.info('Error checking metadata value: %s', value)
return

article_val = getattr(article, field, None)
Expand All @@ -44,7 +48,8 @@ def update_article_field(article: Article, field: str, value: Any):
setattr(article, field, normalize_text(value))


def check_article(article: Article) -> Article:
def update_article(article: Article) -> Article:
"""Check whether there are better data for this article and whether its url is pointing somewhere decent."""
source_url = article.meta.get('source_url') or article.url
contents = {}
if source_url:
Expand All @@ -66,6 +71,7 @@ def check_article(article: Article) -> Article:


def check_articles(sources: List[str], batch_size=100):
"""Check `batch_size` articles with the given `sources` to see if they have better data."""
logger.info('Checking %s articles for %s', batch_size, ', '.join(sources))
with make_session() as session:
for article in tqdm(
Expand All @@ -75,7 +81,7 @@ def check_articles(sources: List[str], batch_size=100):
.limit(batch_size)
.all()
):
check_article(article)
update_article(article)
session.add(article)
logger.debug('commiting')
try:
Expand Down
2 changes: 1 addition & 1 deletion migrations/versions/cfd1704ad799_date_checked_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

# revision identifiers, used by Alembic.
revision = 'cfd1704ad799'
down_revision = 'f5a2bcfa6b2c'
down_revision = '1866340e456a'
branch_labels = None
depends_on = None

Expand Down
14 changes: 7 additions & 7 deletions tests/align_data/sources/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from unittest.mock import patch, Mock

from align_data.db.models import Article
from align_data.sources.validate import update_article_field, check_article, check_articles
from align_data.sources.validate import update_article_field, update_article


@pytest.mark.parametrize('url', (
Expand Down Expand Up @@ -72,7 +72,7 @@ def test_update_article_field_with_new_field_and_empty_value():
assert hasattr(article, 'description') == False


def test_check_article_gets_metadata():
def test_update_article_gets_metadata():
data = {
'text': 'bla bla',
'source_url': 'http://ble.com',
Expand All @@ -82,7 +82,7 @@ def test_check_article_gets_metadata():
article = Article(url='http://this.should.change', meta={})
with patch('align_data.sources.validate.item_metadata', return_value=data):
with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=200)):
check_article(article)
update_article(article)
assert article.to_dict() == {
'authors': ['mr. blobby', 'johhny'],
'date_published': None,
Expand All @@ -97,7 +97,7 @@ def test_check_article_gets_metadata():
}


def test_check_article_no_metadata():
def test_update_article_no_metadata():
data = {
'error': 'this failed!',
'text': 'bla bla',
Expand All @@ -108,7 +108,7 @@ def test_check_article_no_metadata():
article = Article(url='http://this.should.not.change', meta={})
with patch('align_data.sources.validate.item_metadata', return_value=data):
with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=200)):
check_article(article)
update_article(article)
assert article.to_dict() == {
'authors': [],
'date_published': None,
Expand All @@ -122,9 +122,9 @@ def test_check_article_no_metadata():
}


def test_check_article_bad_url():
def test_update_article_bad_url():
article = Article(url='http://this.should.not.change', meta={})
with patch('align_data.sources.validate.item_metadata', return_value={}):
with patch('align_data.sources.validate.fetch', return_value=Mock(status_code=400)):
check_article(article)
update_article(article)
assert article.status == 'Unreachable url'

0 comments on commit 130056a

Please sign in to comment.