Skip to content

Commit

Permalink
filter out rows with empty text
Browse files Browse the repository at this point in the history
  • Loading branch information
mruwnik committed Aug 15, 2023
1 parent 0519655 commit dfd853a
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 8 deletions.
6 changes: 6 additions & 0 deletions align_data/common/alignment_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,12 @@ def fetch_entries(self):
if not entry:
continue

try:
entry.verify_id_fields()
except AssertionError as e:
logger.error(e)
continue

yield entry

if self.COOLDOWN:
Expand Down
21 changes: 21 additions & 0 deletions align_data/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
)
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
from sqlalchemy.dialects.mysql import LONGTEXT
from sqlalchemy.ext.hybrid import hybrid_property
from align_data.settings import PINECONE_METADATA_KEYS


logger = logging.getLogger(__name__)
OK_STATUS = None


class Base(DeclarativeBase):
Expand Down Expand Up @@ -131,6 +133,25 @@ def add_meta(self, key, val):
self.meta = {}
self.meta[key] = val

@hybrid_property
def is_valid(self):
    """Python-side validity check for an in-memory article.

    An article is considered valid when it has non-whitespace text, a
    url, a title, a non-None authors value, and an OK (None) status.
    Evaluated on instances; the paired ``.expression`` below covers
    SQL queries.
    """
    # Empty or whitespace-only text disqualifies the article.
    has_text = self.text and self.text.strip()
    return (
        has_text
        and self.url
        and self.title
        and self.authors is not None
        and self.status == OK_STATUS
    )

@is_valid.expression
def is_valid(cls):
    """SQL-side counterpart of ``is_valid`` for use in queries.

    Produces a boolean SQL expression requiring an OK (NULL) status and
    non-NULL text, url, title and authors columns.
    NOTE(review): unlike the Python-side check, this does not exclude
    empty or whitespace-only text — confirm whether that is intended.
    """
    # ``.is_not(None)`` renders as ``IS NOT NULL`` — equivalent to the
    # overloaded ``!= None`` comparison, but explicit.
    return (
        (cls.status == OK_STATUS)
        & cls.text.is_not(None)
        & cls.url.is_not(None)
        & cls.title.is_not(None)
        & cls.authors.is_not(None)
    )

@classmethod
def before_write(cls, mapper, connection, target):
target.verify_id_fields()
Expand Down
10 changes: 7 additions & 3 deletions align_data/db/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging

from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy import create_engine, and_
from sqlalchemy.orm import Session
from align_data.settings import DB_CONNECTION_URI
from align_data.db.models import Article
Expand All @@ -23,5 +23,9 @@ def make_session(auto_commit=False):
def stream_pinecone_updates(session, custom_sources: List[str]):
    """Yield Article rows that need to be pushed to Pinecone.

    Defect fixed: this span was unmerged diff residue — the pre-commit
    two-filter body and the post-commit three-filter body were both
    present, which is not valid Python. Reconstructed as the post-commit
    version.

    :param session: an open SQLAlchemy session to query with.
    :param custom_sources: only articles whose ``source`` is in this
        list are yielded.
    :returns: a generator of ``Article`` rows, streamed in batches of
        1000 via ``yield_per`` to bound memory use. Only rows flagged
        as requiring a Pinecone update and passing ``Article.is_valid``
        are included.
    """
    yield from (
        session.query(Article)
        .filter(Article.pinecone_update_required.is_(True))
        .filter(Article.is_valid)
        .filter(Article.source.in_(custom_sources))
        .yield_per(1000)
    )
11 changes: 6 additions & 5 deletions align_data/sources/articles/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from urllib.parse import urlparse

import pandas as pd
from align_data.sources.articles import articles
from gdown.download import download
from markdownify import markdownify
from pypandoc import convert_file
Expand Down Expand Up @@ -123,16 +124,12 @@ def not_processed(self, item):
)

def process_entry(self, item):
    """Build a data entry for *item*, dispatching arxiv links specially.

    Defect fixed: this span was unmerged diff residue — the pre-commit
    condition and a removed early-return block were interleaved with the
    new code. Reconstructed as the post-commit version.

    :param item: a spreadsheet row with at least a ``url`` attribute.
    :returns: the entry produced by ``make_data_entry`` from the
        fetched contents.
    """
    if ArxivPapers.is_arxiv(item.url):
        # Arxiv items are fetched through the ArxivPapers dataset and
        # tagged with the 'arxiv' source rather than this dataset's own.
        contents = ArxivPapers.get_contents(item)
        contents['source'] = 'arxiv'
    else:
        contents = self.get_contents(item)

    return self.make_data_entry(contents)


Expand Down Expand Up @@ -218,6 +215,10 @@ def _get_text(self, item):
class ArxivPapers(SpreadsheetDataset):
COOLDOWN: int = 1

@staticmethod
def is_arxiv(url):
    """Return True when *url*'s domain is arxiv.org."""
    domain = parse_domain(url)
    return domain == "arxiv.org"

@classmethod
def get_contents(cls, item) -> Dict:
contents = fetch_arxiv(item.url or item.source_url)
Expand Down

0 comments on commit dfd853a

Please sign in to comment.