Skip to content

Commit

Permalink
feat: add script to pull articles daily
Browse files Browse the repository at this point in the history
  • Loading branch information
marcus-ny committed Sep 25, 2024
1 parent ce31644 commit 6bc840c
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 18 deletions.
1 change: 0 additions & 1 deletion backend/batch_prompts.jsonl

This file was deleted.

1 change: 0 additions & 1 deletion backend/batch_result.jsonl

This file was deleted.

106 changes: 106 additions & 0 deletions backend/src/cron/fetch_articles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from datetime import datetime
import httpx

from src.common.constants import GUARDIAN_API_KEY
from sqlalchemy import select
from src.events.models import Article, ArticleSource
from src.common.database import engine
from sqlalchemy.orm import Session
from src.scrapers.guardian.process import GuardianArticle, GuardianArticleFields


def query_page(page: int, date):
    """Fetch one page of Guardian search results published on/after *date*.

    Args:
        page: 1-based page number to request (50 results per page).
        date: value for the API's ``from-date`` filter (e.g. a ``datetime.date``).

    Returns:
        The list of article dicts for that page, or ``[]`` when the request
        fails or the API reports a non-ok status.
    """
    try:
        response = httpx.get(
            "https://content.guardianapis.com/search",
            params={
                "api-key": GUARDIAN_API_KEY,
                "page-size": 50,
                "page": page,
                "lang": "en",
                "show-fields": ["all"],
                "from-date": date,
            },
            # Fail fast instead of letting the daily cron run hang on a
            # stalled connection.
            timeout=30.0,
        )
        response_json = response.json()
    except (httpx.HTTPError, ValueError) as e:
        # Network failure, timeout, or a non-JSON body (e.g. an HTML error
        # page) — previously these crashed the whole run with a raw traceback.
        print(f"Request for page {page} failed: {e}")
        return []
    data = response_json.get("response", {})
    if data.get("status") != "ok":
        print("something went wrong with page:", page)
        return []
    return data["results"]


def get_today_articles():
    """Collect all Guardian articles published today.

    Fetches up to 10 pages of 50 results each; a page returning fewer
    than 50 articles is treated as the last page.
    """
    today = datetime.now().date()
    collected = []
    for page in range(1, 11):
        batch = query_page(page, today)
        if len(batch) < 50:
            # Short page: no more results after this one.
            print(f"On page {page}, only got {len(batch)} articles. Stopping.")
            collected += batch
            break
        print(f"On page {page}, got {len(batch)} articles")
        collected += batch

    return collected


def form_guardian_artilcle_obj(article: dict):
    """Convert a raw Guardian API result dict into a GuardianArticle model.

    NOTE(review): the "artilcle" typo in the name is kept intentionally so
    existing callers keep working.
    """
    raw_fields = article["fields"]
    parsed_fields = GuardianArticleFields(
        bodyText=raw_fields["bodyText"],
        trailText=raw_fields["trailText"],
        thumbnail=raw_fields["thumbnail"],
    )
    return GuardianArticle(
        fields=parsed_fields,
        webUrl=article["webUrl"],
        webTitle=article["webTitle"],
        webPublicationDate=article["webPublicationDate"],
    )


def add_daily_articles_to_db(article: GuardianArticle):
    """Insert *article* into the database unless an identical row exists.

    Deduplicates on (title, source, date, url). Rolls back the session when
    the insert fails so the connection is returned to the pool clean.

    Returns:
        True if the article was inserted, False if it was a duplicate or
        the insert failed. (Previously success fell through and returned
        None, making the result inconsistent with the False branches.)
    """
    with Session(engine) as session:
        # Look for an existing row matching every identifying column.
        query_article = session.scalars(
            select(Article).where(
                Article.title == article.webTitle,
                Article.source == ArticleSource.GUARDIAN,
                Article.date == article.webPublicationDate,
                Article.url == article.webUrl,
            )
        ).first()

        if query_article:
            print(f"Article {article.webTitle} already exists in database")
            return False

        try:
            article_orm = Article(
                title=article.webTitle,
                summary=article.fields.trailText if article.fields.trailText else "",
                url=article.webUrl,
                source=ArticleSource.GUARDIAN,
                body=article.fields.bodyText,
                date=article.webPublicationDate,
                image_url=article.fields.thumbnail or "",
            )
            session.add(article_orm)
            session.commit()
            print(
                f"Added {article.webTitle} to database at {article.webPublicationDate}"
            )
            return True
        except Exception as e:
            # A failed flush/commit leaves the session in a broken state;
            # roll back so it exits the context manager cleanly.
            session.rollback()
            print(f"Something went wrong with article {article.webTitle}")
            print(e)
            return False


def populate_daily_articles(limit=1):
    """Fetch today's Guardian articles and store them in the database.

    Args:
        limit: maximum number of fetched articles to store; pass ``None``
            to store everything. Defaults to 1 to preserve the original
            behavior, where the hard-coded ``articles[:1]`` slice looked
            like leftover debug code — TODO(review): confirm intent and
            lift the cap for the real daily cron run.
    """
    articles = get_today_articles()
    if limit is not None:
        articles = articles[:limit]
    for article in articles:
        article_obj = form_guardian_artilcle_obj(article)
        add_daily_articles_to_db(article_obj)


if __name__ == "__main__":
populate_daily_articles()
33 changes: 17 additions & 16 deletions backend/src/scrapers/guardian/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,20 @@ class GuardianArticle(BaseModel):
webPublicationDate: str


with open(args.input) as f:
data = json.load(f)
for row in data:
article = GuardianArticle.model_validate(row)
article_orm = Article(
title=article.webTitle,
summary=article.fields.trailText if article.fields.trailText else "",
url=article.webUrl,
source=ArticleSource.GUARDIAN,
body=article.fields.bodyText,
date=article.webPublicationDate,
image_url=article.fields.thumbnail or "",
)
with Session(engine) as session:
session.add(article_orm)
session.commit()
def populate_existing_articles():
    """Load articles from the JSON file named by ``args.input`` into the DB.

    Each row is validated as a GuardianArticle and committed individually,
    so one bad row does not roll back earlier inserts.
    """
    with open(args.input) as f:
        data = json.load(f)
    # Use one session for the whole batch instead of constructing a new
    # Session(engine) per row; each article is still committed on its own,
    # preserving the original per-row commit semantics.
    with Session(engine) as session:
        for row in data:
            article = GuardianArticle.model_validate(row)
            article_orm = Article(
                title=article.webTitle,
                summary=article.fields.trailText if article.fields.trailText else "",
                url=article.webUrl,
                source=ArticleSource.GUARDIAN,
                body=article.fields.bodyText,
                date=article.webPublicationDate,
                image_url=article.fields.thumbnail or "",
            )
            session.add(article_orm)
            session.commit()

0 comments on commit 6bc840c

Please sign in to comment.