Merge pull request #84 from cs3216-a3-group-4/feat-add-cron-jobs-for-fetching-daily-articles

Feat add cron jobs for fetching daily articles
Showing 7 changed files with 167 additions and 5,956 deletions.
(The other changed files in this commit are deleted files and large diffs, which are not rendered; only the new file is shown below.)
@@ -0,0 +1,145 @@
from datetime import datetime

import httpx
from sqlalchemy import select
from sqlalchemy.orm import Session

from src.common.constants import GUARDIAN_API_KEY
from src.common.database import engine
from src.embeddings.vector_store import store_documents
from src.events.models import Article, ArticleSource, Event
from src.lm.generate_events import generate_events
from src.scrapers.guardian.process import GuardianArticle, GuardianArticleFields
from src.scripts.populate import populate


def query_page(page: int, date):
    # Fetch one page (up to 50 results) of Guardian articles published
    # on or after `date`.
    response = httpx.get(
        "https://content.guardianapis.com/search",
        params={
            "api-key": GUARDIAN_API_KEY,
            "page-size": 50,
            "page": page,
            "lang": "en",
            "show-fields": ["all"],
            "from-date": date,
        },
    )
    response_json = response.json()
    data = response_json["response"]
    if data["status"] != "ok":
        print("something went wrong with page:", page)
        return []
    return data["results"]


def get_today_articles():
    # Page through the search API (at most 10 pages of 50) and collect
    # every article published today; a short page means we have hit the end.
    result = []
    cur_date = datetime.now().date()
    for i in range(1, 11):
        new_batch = query_page(i, cur_date)
        if len(new_batch) < 50:
            print(f"On page {i}, only got {len(new_batch)} articles. Stopping.")
            result += new_batch
            break
        print(f"On page {i}, got {len(new_batch)} articles")
        result += new_batch

    return result


def form_guardian_article_obj(article: dict):
    article_obj = GuardianArticle(
        fields=GuardianArticleFields(
            bodyText=article["fields"]["bodyText"],
            trailText=article["fields"]["trailText"],
            thumbnail=article["fields"]["thumbnail"],
        ),
        webUrl=article["webUrl"],
        webTitle=article["webTitle"],
        webPublicationDate=article["webPublicationDate"],
    )
    return article_obj


def add_daily_articles_to_db(article: GuardianArticle):
    with Session(engine) as session:
        # Skip articles that are already in the database.
        query_article = session.scalars(
            select(Article).where(
                Article.title == article.webTitle,
                Article.source == ArticleSource.GUARDIAN,
                Article.date == article.webPublicationDate,
                Article.url == article.webUrl,
            )
        ).first()

        if query_article:
            print(f"Article {article.webTitle} already exists in database")
            return False

        try:
            article_orm = Article(
                title=article.webTitle,
                summary=article.fields.trailText if article.fields.trailText else "",
                url=article.webUrl,
                source=ArticleSource.GUARDIAN,
                body=article.fields.bodyText,
                date=article.webPublicationDate,
                image_url=article.fields.thumbnail or "",
            )
            session.add(article_orm)
            session.commit()
            print(
                f"Added {article.webTitle} to database at {article.webPublicationDate}"
            )
            # Return True on success so all paths return a boolean.
            return True
        except Exception as e:
            print(f"Something went wrong with article {article.webTitle}")
            print(e)
            return False


def populate_daily_articles():
    articles = get_today_articles()
    # NOTE: only the first article is kept here; this looks like a
    # debugging limit left in rather than intended behaviour.
    articles = articles[:1]
    for article in articles:
        article_obj = form_guardian_article_obj(article)
        add_daily_articles_to_db(article_obj)


def process_new_articles() -> list[dict]:
    # Collect every article that does not yet have a generated event.
    with Session(engine) as session:
        result = session.scalars(
            select(Article).where(
                Article.id.not_in(
                    list(session.scalars(select(Event.original_article_id)))
                )
            )
        ).all()

    articles = []
    for article in result:
        data_dict = {
            "id": article.id,
            "bodyText": article.body,
        }
        articles.append(data_dict)

    return articles


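# An equivalent single-round-trip variant of the query above (an
# editor's sketch, not part of this commit): not_in() accepts a
# subquery directly, so the processed IDs never have to be
# materialized into a Python list first:
#
#     processed_ids = select(Event.original_article_id)
#     result = session.scalars(
#         select(Article).where(Article.id.not_in(processed_ids))
#     ).all()
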
# NOTE: this method should run without issue as long as the number of
# OpenAI calls stays under 500, which is OpenAI's rate limit. That should
# hold as long as the ~25k articles already in the database have been
# processed beforehand.


def run():
    # Add new articles to the database
    populate_daily_articles()
    # Process new articles, i.e. find articles we have not yet generated events for
    articles = process_new_articles()
    # Generate events from the articles, written to lm_events_output.json
    generate_events(articles)
    # Populate the database with events from lm_events_output.json
    populate()
    # Store analyses in the vector store
    store_documents()
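
The scheduling side of the cron job is not shown in this file. As a rough sketch of how the run() entry point might be wired up (the module path src.cron.fetch_articles and the daily 06:00 UTC schedule are assumptions, not taken from this commit), an APScheduler worker could look like:

    # Hypothetical scheduling sketch. The import path and the 06:00 UTC
    # schedule are assumptions; they are not part of this commit.
    from apscheduler.schedulers.blocking import BlockingScheduler

    from src.cron.fetch_articles import run  # assumed module path

    scheduler = BlockingScheduler(timezone="UTC")
    # populate_daily_articles() searches for articles from "today", so a
    # run later in the day collects more of that day's articles.
    scheduler.add_job(run, "cron", hour=6, minute=0)

    if __name__ == "__main__":
        scheduler.start()

A plain crontab entry invoking the module once a day would do the same job.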