Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat add cron jobs for fetching daily articles #84

Merged
merged 3 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion backend/batch_prompts.jsonl

This file was deleted.

1 change: 0 additions & 1 deletion backend/batch_result.jsonl

This file was deleted.

5,896 changes: 0 additions & 5,896 deletions backend/lm_events_output.json

Large diffs are not rendered by default.

145 changes: 145 additions & 0 deletions backend/src/cron/fetch_articles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from datetime import datetime
import httpx

from src.common.constants import GUARDIAN_API_KEY
from sqlalchemy import select
from src.events.models import Article, ArticleSource, Event
from src.common.database import engine
from sqlalchemy.orm import Session
from src.scrapers.guardian.process import GuardianArticle, GuardianArticleFields

from src.lm.generate_events import generate_events
from src.scripts.populate import populate
from src.embeddings.vector_store import store_documents


def query_page(page: int, date):
    """Fetch one page of results from the Guardian content search endpoint.

    Args:
        page: Page number to request (sent as the ``page`` query parameter).
        date: Lower bound sent as the ``from-date`` query parameter.

    Returns:
        The ``results`` list from the response payload, or an empty list when
        the request fails, the body is not the expected JSON shape, or the
        payload's ``status`` is not ``"ok"``.
    """
    try:
        response = httpx.get(
            "https://content.guardianapis.com/search",
            params={
                "api-key": GUARDIAN_API_KEY,
                "page-size": 50,
                "page": page,
                "lang": "en",
                "show-fields": ["all"],
                "from-date": date,
            },
        )
        # Surface 4xx/5xx responses as errors instead of trying to parse them.
        response.raise_for_status()
        data = response.json()["response"]
        status = data["status"]
    except (httpx.HTTPError, ValueError, KeyError, TypeError) as e:
        # Network failure, non-JSON body, or a payload without the expected
        # keys: treat it like any other failed page so the caller stops paging
        # instead of crashing the whole cron run.
        print("something went wrong with page:", page)
        print(e)
        return []

    if status != "ok":
        print("something went wrong with page:", page)
        return []
    return data.get("results", [])


def get_today_articles():
    """Collect Guardian articles from today's date onward, page by page.

    Requests pages 1 through 10 via ``query_page`` and stops early at the
    first page that comes back with fewer than 50 articles. Articles from
    that short page are still included.

    Returns:
        A flat list of the raw article entries from every page fetched.
    """
    today = datetime.now().date()
    collected = []
    page = 1
    while page <= 10:
        batch = query_page(page, today)
        batch_size = len(batch)
        collected.extend(batch)
        if batch_size < 50:
            # A short page means there is nothing further to fetch.
            print(f"On page {page}, only got {batch_size} articles. Stopping.")
            break
        print(f"On page {page}, got {batch_size} articles")
        page += 1

    return collected


def form_guardian_article_obj(article: dict):
    """Build a ``GuardianArticle`` from one raw search-result entry.

    Args:
        article: A result dict carrying ``webUrl``, ``webTitle``,
            ``webPublicationDate`` and a ``fields`` sub-dict.

    Returns:
        The populated ``GuardianArticle``.

    Raises:
        KeyError: If a required key (``fields``, ``bodyText``, ``webUrl``,
            ``webTitle`` or ``webPublicationDate``) is missing.
    """
    fields = article["fields"]
    article_obj = GuardianArticle(
        fields=GuardianArticleFields(
            bodyText=fields["bodyText"],
            # trailText and thumbnail are read with .get() so that an entry
            # lacking them does not raise KeyError. The code that persists
            # articles already falls back to "" for both.
            # NOTE(review): assumes the model accepts None for these two
            # fields - confirm against GuardianArticleFields.
            trailText=fields.get("trailText"),
            thumbnail=fields.get("thumbnail"),
        ),
        webUrl=article["webUrl"],
        webTitle=article["webTitle"],
        webPublicationDate=article["webPublicationDate"],
    )
    return article_obj


def add_daily_articles_to_db(article: GuardianArticle):
    """Insert one Guardian article unless a matching row is already stored.

    A row counts as a match when its title, source, date and URL all equal
    those of ``article``.

    Args:
        article: The article to persist.

    Returns:
        True if a new row was committed; False if a matching row already
        exists or the insert failed.
    """
    with Session(engine) as session:
        existing = session.scalars(
            select(Article).where(
                Article.title == article.webTitle,
                Article.source == ArticleSource.GUARDIAN,
                Article.date == article.webPublicationDate,
                Article.url == article.webUrl,
            )
        ).first()

        if existing:
            print(f"Article {article.webTitle} already exists in database")
            return False

        try:
            article_orm = Article(
                title=article.webTitle,
                summary=article.fields.trailText if article.fields.trailText else "",
                url=article.webUrl,
                source=ArticleSource.GUARDIAN,
                body=article.fields.bodyText,
                date=article.webPublicationDate,
                image_url=article.fields.thumbnail or "",
            )
            session.add(article_orm)
            session.commit()
        except Exception as e:
            # Best-effort insert: report the failure and keep the cron run
            # alive. Roll back explicitly so the session is left clean.
            session.rollback()
            print(f"Something went wrong with article {article.webTitle}")
            print(e)
            return False
        else:
            print(
                f"Added {article.webTitle} to database at {article.webPublicationDate}"
            )
            # Report success explicitly; falling off the end would return
            # None, which reads the same as the False failure paths.
            return True


def populate_daily_articles(limit=None):
    """Fetch today's Guardian articles and store each one in the database.

    Args:
        limit: Optional cap on how many fetched articles to store, useful for
            a quick manual run. ``None`` (the default) stores every article.
    """
    articles = get_today_articles()
    # The previous hard-coded ``articles[:1]`` meant only one article was ever
    # ingested per run; truncation is now opt-in through ``limit``.
    if limit is not None:
        articles = articles[:limit]
    for article in articles:
        try:
            article_obj = form_guardian_article_obj(article)
        except (KeyError, ValueError) as e:
            # Skip a malformed entry rather than aborting the whole batch.
            print("Skipping malformed article:", article.get("webTitle"))
            print(e)
            continue
        add_daily_articles_to_db(article_obj)


def process_new_articles() -> list[dict]:
    """Return the articles that no event references yet.

    An article is selected when its id does not appear as any
    ``Event.original_article_id``.

    Returns:
        One dict per selected article with the keys ``id``, ``bodyText`` and
        ``webPublicationDate``.
    """
    # Run the exclusion as a subquery in the database instead of loading every
    # processed id into Python and sending it back as a huge bound IN list.
    # NULL ids are filtered out because a single NULL inside NOT IN makes the
    # predicate match no rows at all.
    processed_ids = select(Event.original_article_id).where(
        Event.original_article_id.is_not(None)
    )

    with Session(engine) as session:
        unprocessed = session.scalars(
            select(Article).where(Article.id.not_in(processed_ids))
        ).all()

        return [
            {
                "id": article.id,
                "bodyText": article.body,
                # The event builder reads "webPublicationDate" from this dict
                # and would otherwise record the string "None" as the date.
                # NOTE(review): assumes Article.date is the value downstream
                # expects here - confirm the format.
                "webPublicationDate": article.date,
            }
            for article in unprocessed
        ]


# NOTE: This method should work without issue as long as it makes fewer than 500 calls, which is the rate limit imposed by OpenAI.
# That should hold as long as the 25k articles already in the database have been processed.


def run():
    """Run the daily pipeline from article ingestion through to indexing.

    The steps run strictly in sequence; each one relies on what the previous
    step stored.
    """
    # 1. Pull today's articles into the database.
    populate_daily_articles()

    # 2. Gather the articles that have no generated events yet.
    pending_articles = process_new_articles()

    # 3. Generate events for them (written to lm_events_output.json).
    generate_events(pending_articles)

    # 4. Load the events from lm_events_output.json into the database.
    populate()

    # 5. Store the analyses in the vector store.
    store_documents()
43 changes: 3 additions & 40 deletions backend/src/lm/generate_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
os.environ["LANGCHAIN_TRACING_V2"] = LANGCHAIN_TRACING_V2
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

lm_model = ChatOpenAI(model="gpt-4o-mini")
lm_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)


class CategoryAnalysis(BaseModel):
Expand Down Expand Up @@ -51,44 +51,6 @@ class EventDetails(BaseModel):
examples: list[Example]


sample_text = """
A seething Mikel Arteta admitted that he was “amazed about how inconsistent the decisions can be” after Declan Rice was sent off as Arsenal dropped their first points of the new season against Brighton.

Arsenal had looked on course to maintain their 100% start to the new campaign when Bukayo Saka pounced on a mistake by Lewis Dunk to set up Kai Havertz. But an incident early in the second half when Rice, who had never been sent off before in his career and will now miss the north London derby against Tottenham after the international break, was shown a second yellow card by referee Chris Kavanagh after being deemed to have obstructed Joël Veltman from taking a free-kick altered the momentum of the game entirely.

Arsenal’s Declan Rice is stunned after being shown a second yellow card by the referee Chris Kavanagh
Brighton’s João Pedro pegs back Arsenal for point after Declan Rice sees red
Read more
João Pedro went on to equalise and maintain Brighton’s unbeaten start under Fabian Hürzeler. But Arteta said that he had been stunned by Kavanagh’s decision not to take any action against Veltman for making contact with Rice and with another incident in the first half when Pedro kicked the ball away.

“I was amazed. Amazed, amazed, amazed because of how inconsistent decisions can be,” he said.

“In the first half, there are two incidents and nothing happens.

“Then, in a non-critical area, the ball hits Declan, he turns around, he doesn’t see the player coming and he touches the ball.

“By law, he can make that call, but then by law he needs to make the next call, which is a red card so we play 10 v 10. This is what amazed me. At this level it’s amazing.”

Rice said later he was “shocked … I think you could see that on my face. But this is the laws of the game. If you touch the ball even a little bit it’s a red card after my challenge in the first half. It was tough, it was harsh but I have to move on from it.”

Hürzeler, the 31-year-old who replaced Roberto De Zerbi in the Brighton dugout this summer, felt Kavanagh had made the right decision. “For me it was a clear red card,” he said. “He shoots the ball away – it’s wasting time.”

skip past newsletter promotion
Sign up to Football Daily

Free daily newsletter
Kick off your evenings with the Guardian's take on the world of football

Enter your email address
Sign up
Privacy Notice: Newsletters may contain info about charities, online ads, and content funded by outside parties. For more information see our Privacy Policy. We use Google reCaptcha to protect our website and the Google Privacy Policy and Terms of Service apply.
after newsletter promotion
Arsenal still had an opportunity to take all three points but Havertz and Saka both spurned late opportunities and Arteta’s side now face the prospect of a trip to Tottenham without Rice or new signing Mikel Merino, who injured his shoulder in his first training session this week. “This is what happens. We have to adapt to that context,” said the Arsenal manager.

“That’s why we have other players that can fulfil that [role] and give that opportunity to somebody else.

“But the team reacted to what we had to do playing at home with 10 men. We didn’t want to be so deep defending like this, but we read the game and we played the game that we had to play and we should have got rewarded.”"""

file_path = "lm_events_output.json"


Expand All @@ -114,7 +76,7 @@ def form_event_json(event_details, article) -> dict:
description=event_details.get("description", ""),
analysis_list=event_details.get("analysis_list", {}),
duplicate=False,
date="",
date=str(article.get("webPublicationDate")),
is_singapore=event_details.get("in_singapore", False),
categories=event_details.get("category", []),
original_article_id=article.get("id"),
Expand All @@ -134,6 +96,7 @@ def generate_events_from_article(article: dict) -> dict:
result = lm_model.invoke(messages)
parser = JsonOutputParser(pydantic_object=EventDetails)
events = parser.invoke(result)
print(f"Model temp: {lm_model.temperature}")
return events


Expand Down
4 changes: 2 additions & 2 deletions backend/src/scrapers/guardian/get_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from src.events.models import Article


def get_articles():
def get_articles() -> list[dict]:
with Session(engine) as session:
# Select the first 5 articles
result = session.scalars(select(Article).limit(100))
result = session.scalars(select(Article).limit(3))

articles = []
# Iterate over the result and print each article
Expand Down
33 changes: 17 additions & 16 deletions backend/src/scrapers/guardian/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,20 @@ class GuardianArticle(BaseModel):
webPublicationDate: str


with open(args.input) as f:
data = json.load(f)
for row in data:
article = GuardianArticle.model_validate(row)
article_orm = Article(
title=article.webTitle,
summary=article.fields.trailText if article.fields.trailText else "",
url=article.webUrl,
source=ArticleSource.GUARDIAN,
body=article.fields.bodyText,
date=article.webPublicationDate,
image_url=article.fields.thumbnail or "",
)
with Session(engine) as session:
session.add(article_orm)
session.commit()
def populate_existing_articles():
    """Load Guardian articles from the JSON file at ``args.input`` into the database.

    Each row is validated as a ``GuardianArticle`` and committed individually,
    so rows stored before a failure stay in the database.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(args.input, encoding="utf-8") as f:
        data = json.load(f)

    # One session serves the whole import instead of a new one per row.
    # Commits stay per row to keep the original durability behaviour.
    with Session(engine) as session:
        for row in data:
            article = GuardianArticle.model_validate(row)
            article_orm = Article(
                title=article.webTitle,
                summary=article.fields.trailText if article.fields.trailText else "",
                url=article.webUrl,
                source=ArticleSource.GUARDIAN,
                body=article.fields.bodyText,
                date=article.webPublicationDate,
                image_url=article.fields.thumbnail or "",
            )
            session.add(article_orm)
            session.commit()
Loading