From dfd853a7965d5b5a753d12a45a360ae7599e86c7 Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Tue, 15 Aug 2023 14:32:13 +0200 Subject: [PATCH] filter out rows with empty text --- align_data/common/alignment_dataset.py | 6 ++++++ align_data/db/models.py | 21 +++++++++++++++++++++ align_data/db/session.py | 10 +++++++--- align_data/sources/articles/datasets.py | 11 ++++++----- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py index 8bbd224e..5ff55f63 100644 --- a/align_data/common/alignment_dataset.py +++ b/align_data/common/alignment_dataset.py @@ -202,6 +202,12 @@ def fetch_entries(self): if not entry: continue + try: + entry.verify_id_fields() + except AssertionError as e: + logger.error(e) + continue + yield entry if self.COOLDOWN: diff --git a/align_data/db/models.py b/align_data/db/models.py index afbd27fd..6cb3878f 100644 --- a/align_data/db/models.py +++ b/align_data/db/models.py @@ -16,10 +16,12 @@ ) from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship from sqlalchemy.dialects.mysql import LONGTEXT +from sqlalchemy.ext.hybrid import hybrid_property from align_data.settings import PINECONE_METADATA_KEYS logger = logging.getLogger(__name__) +OK_STATUS = None class Base(DeclarativeBase): @@ -131,6 +133,25 @@ def add_meta(self, key, val): self.meta = {} self.meta[key] = val + @hybrid_property + def is_valid(self): + return ( + self.text and self.text.strip() and + self.url and self.title and + self.authors is not None and + self.status == OK_STATUS + ) + + @is_valid.expression + def is_valid(cls): + return ( + (cls.status == OK_STATUS) + & (cls.text != None) + & (cls.url != None) + & (cls.title != None) + & (cls.authors != None) + ) + @classmethod def before_write(cls, mapper, connection, target): target.verify_id_fields() diff --git a/align_data/db/session.py b/align_data/db/session.py index ace0ff8a..55e03e53 100644 --- a/align_data/db/session.py +++ b/align_data/db/session.py @@ -2,7 +2,7 @@ import logging from contextlib import contextmanager -from sqlalchemy import create_engine +from sqlalchemy import create_engine, and_ from sqlalchemy.orm import Session from align_data.settings import DB_CONNECTION_URI from align_data.db.models import Article @@ -23,5 +23,9 @@ def make_session(auto_commit=False): def stream_pinecone_updates(session, custom_sources: List[str]): """Yield Pinecone entries that require an update.""" yield from session.query(Article).filter( - Article.pinecone_update_required.is_(True) - ).filter(Article.source.in_(custom_sources)).yield_per(1000) + Article.pinecone_update_required.is_(True), + ).filter( + Article.is_valid + ).filter( + Article.source.in_(custom_sources) + ).yield_per(1000) diff --git a/align_data/sources/articles/datasets.py b/align_data/sources/articles/datasets.py index d032903f..f8e2c897 100644 --- a/align_data/sources/articles/datasets.py +++ b/align_data/sources/articles/datasets.py @@ -6,6 +6,7 @@ from urllib.parse import urlparse import pandas as pd +from align_data.sources.articles import articles from gdown.download import download from markdownify import markdownify from pypandoc import convert_file @@ -123,16 +124,12 @@ def not_processed(self, item): ) def process_entry(self, item): - if parse_domain(item.url) == "arxiv.org": + if ArxivPapers.is_arxiv(item.url): contents = ArxivPapers.get_contents(item) contents['source'] = 'arxiv' else: contents = self.get_contents(item) - # Skip items that can't be saved because missing fields - if not all(contents.get(key) for key in ['title', 'url']): - return None - return self.make_data_entry(contents) @@ -218,6 +215,10 @@ def _get_text(self, item): class ArxivPapers(SpreadsheetDataset): COOLDOWN: int = 1 + @staticmethod + def is_arxiv(url): + return parse_domain(url) == "arxiv.org" + @classmethod def get_contents(cls, item) -> Dict: contents = fetch_arxiv(item.url or item.source_url)