Merge pull request #95 from cs3216-a3-group-4/seeleng/add-start-date-for-cna-scraper

feat: add start date for cna scraper
seelengxd authored Sep 26, 2024
2 parents a519cba + 4034677 commit 6a32c82
Showing 13 changed files with 100 additions and 6,419 deletions.
21 changes: 19 additions & 2 deletions backend/src/cron/fetch_articles.py
@@ -1,12 +1,15 @@
import asyncio
from datetime import datetime, timedelta
from datetime import datetime, timedelta, time
import httpx
import os

from src.common.constants import GUARDIAN_API_KEY
from sqlalchemy import select
from src.events.models import Article, ArticleSource, Event
from src.common.database import engine
from sqlalchemy.orm import Session
from src.scrapers.cna.process import process_all_categories
from src.scrapers.cna.scrape import scrape_from_date
from src.scrapers.guardian.get_articles import get_articles
from src.scrapers.guardian.process import GuardianArticle, GuardianArticleFields

@@ -108,6 +111,20 @@ def populate_daily_articles():
add_daily_articles_to_db(article_obj)


async def populate_daily_articles_cna():
    # create articles folder if it doesn't exist
if "articles" not in os.listdir("./src/scrapers/cna"):
os.mkdir("./src/scrapers/cna/articles")

yesterday = datetime.combine(datetime.now() - timedelta(days=1), time.min)

    # this function doesn't check for duplicates and simply overwrites the json
await scrape_from_date(start_date=yesterday)
    # this function only processes articles from the json that are not already in the db,
    # so it may salvage the broken json
await process_all_categories()


def process_new_articles() -> list[dict]:
with Session(engine) as session:
result = session.scalars(
@@ -136,7 +153,7 @@ def process_new_articles() -> list[dict]:

async def run(limit: int = 30):
# Add new articles to database
# populate_daily_articles()
await populate_daily_articles_cna()
# ADD CNA HERE.
# Process new articles i.e. find articles that we have not generated events for
articles = get_articles(limit)
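For context, here is a minimal sketch of how the new CNA step composes with the scraper helpers touched below. The names `scrape_from_date` and `process_all_categories`, and the anchoring of the window to the start of yesterday, come from the diff above; the standalone `refresh_cna` wrapper is hypothetical and only illustrates the intended flow.

```python
import asyncio
from datetime import datetime, time, timedelta

from src.scrapers.cna.process import process_all_categories
from src.scrapers.cna.scrape import scrape_from_date


async def refresh_cna() -> None:
    # Anchor the scrape window to 00:00 of the previous day, as
    # populate_daily_articles_cna does in the diff above.
    yesterday = datetime.combine(datetime.now() - timedelta(days=1), time.min)

    # Step 1: rewrite the per-category JSON listings from that date onward.
    await scrape_from_date(start_date=yesterday)

    # Step 2: insert any scraped articles that are not yet in the database.
    await process_all_categories()


if __name__ == "__main__":
    asyncio.run(refresh_cna())
```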
2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Asia.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Business.json

Large diffs are not rendered by default.

6,391 changes: 1 addition & 6,390 deletions backend/src/scrapers/cna/data/CNA Explains.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/CNA Insider.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Commentary.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/East Asia.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Singapore.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Sport.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/World.json

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions backend/src/scrapers/cna/process.py
@@ -1,6 +1,7 @@
import argparse
import asyncio
import json
from sqlalchemy import select
from sqlalchemy.orm import Session
from src.events.models import Article, ArticleSource
from src.common.database import engine
@@ -49,6 +50,10 @@ class CNAArticle(BaseModel):
async def process(category: str):
with open(f"./src/scrapers/cna/data/{category}.json") as f:
data = json.load(f)

with Session(engine) as session:
urls = set(session.scalars(select(Article.url)))
count = 0
for page_index, page in enumerate(data):
for index, item in enumerate(page):
try:
@@ -58,8 +63,19 @@ async def process(category: str):
continue
if article.uuid in processed_ids:
continue
if article.absolute_url in urls:
continue
processed_ids.add(article.uuid)

# check again in case there are duplicates
# probably a slight race condition here
with Session(engine) as session:
article_orm = session.scalar(
select(Article).where(Article.url == article.absolute_url)
)
if article_orm:
continue

# Read body text from scrape.py
with open(
os.path.join(folder_path, f"{article.uuid}_{category}.txt")
@@ -84,10 +100,12 @@ async def process(category: str):
with Session(engine) as session:
session.add(article_orm)
session.commit()
count += 1

except Exception as e:
print(f"{category}: something went wrong with {page_index}, {index}")
print(e)
print(f"Added {count} articles for {category}")


async def process_all_categories():
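The deduplication added to `process()` works in two layers: a set of known URLs loaded once per category as a cheap pre-filter, and a per-article re-query immediately before insert to narrow the race window the comment mentions. A minimal sketch of that pattern follows; `insert_if_new` is a hypothetical helper, and it assumes the `Article.url` column shown in the diff.

```python
from sqlalchemy import select
from sqlalchemy.orm import Session

from src.common.database import engine
from src.events.models import Article


def insert_if_new(candidates: list[Article]) -> int:
    """Insert only articles whose URL is not stored yet; return how many were added."""
    # Layer 1: coarse in-memory filter, loaded once.
    with Session(engine) as session:
        known_urls = set(session.scalars(select(Article.url)))

    added = 0
    for article in candidates:
        if article.url in known_urls:
            continue

        # Layer 2: re-check right before inserting, since another worker may
        # have added the same URL after known_urls was loaded. This narrows
        # the race window but does not close it; a unique constraint on
        # Article.url would make the check airtight.
        with Session(engine) as session:
            if session.scalar(select(Article).where(Article.url == article.url)):
                continue
            session.add(article)
            session.commit()

        known_urls.add(article.url)
        added += 1

    return added
```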
69 changes: 50 additions & 19 deletions backend/src/scrapers/cna/scrape.py
@@ -1,9 +1,15 @@
from datetime import datetime
from bs4 import BeautifulSoup
from sqlalchemy import select
from sqlalchemy.orm import Session
from src.common.database import engine
import httpx
import random
import json
import asyncio

from src.events.models import Article


def get_url(category_slug: str, page: int):
return f"https://www.channelnewsasia.com/api/v1/infinitelisting/{category_slug}?_format=json&viewMode=infinite_scroll_listing&page={page}"
@@ -22,25 +28,46 @@ def get_url(category_slug: str, page: int):
# Sustainability section does not have a view more button in CNA
}

with Session(engine) as session:
urls = set(session.scalars(select(Article.url)))


async def scrape(category: str, pages: int = 10):
async def scrape(category: str, pages: int = 10, start_date: datetime = None):
slug = CATEGORIES[category]
data = []
with open(f"./src/scrapers/cna/data/{category}.json") as f:
data = json.load(f)
error_count = 0
async with httpx.AsyncClient() as client:
for i in range(pages):
url = get_url(slug, i)
try:
resp = (await client.get(url)).json()
data.append(resp["result"])
result = resp["result"]
terminate = False
if start_date:
result = [
row
for row in result
if datetime.fromisoformat(row["date"].split("T")[0])
>= start_date
]
if len(result) != len(resp["result"]):
terminate = True
# eliminate duplicates
result = [row for row in result if row["absolute_url"] not in urls]
data.append(result)
if terminate:
print("Reached end of date limit", i, category)
break
await asyncio.sleep(3 + (4 * random.random()))
# save every 10 pages in case something bad happens
error_count = 0
if i % 10 == 0:
print(f"{category}: {i}")
with open(f"./src/scrapers/cna/data/{category}.json", "w") as f:
json.dump(data, f)
except: # noqa: E722
except Exception as e: # noqa: E722
print(e)
print(
f"Something went wrong for {category}, {i}. Might have ran out of pages."
)
@@ -51,12 +78,14 @@ async def scrape(category: str, pages: int = 10):
print(f"Terminated - {category}")
return

with open(f"./src/scrapers/data/{category}.json", "w") as f:
with open(f"./src/scrapers/cna/data/{category}.json", "w") as f:
json.dump(data, f)


async def scrape_index():
await asyncio.gather(*[scrape(category, 200) for category in CATEGORIES])
async def scrape_index(start_date: datetime = None):
await asyncio.gather(
*[scrape(category, 200, start_date) for category in CATEGORIES]
)


async def scrape_single_page(url):
@@ -75,17 +104,18 @@ async def scrape_single_page(url):
scraped_slugs = set()


async def scrape_category(category: str):
async def scrape_category(category: str, start_date: datetime = None):
with open(f"./src/scrapers/cna/data/{category}.json") as f:
data = json.load(f)
skipped = 0

for index, page in enumerate(data):
if category == "Asia" and index <= 58:
continue
for item in page:
if item["type"] != "article":
continue
date = item["date"]
if start_date and datetime.fromisoformat(date.split("T")[0]) < start_date:
continue
try:
absolute_url = item["absolute_url"]
if absolute_url in scraped_slugs:
@@ -103,15 +133,16 @@ async def scrape_category(category: str):
print(f"scraped: {category}, {index}(x10)")


async def scrape_all_categories():
asyncio.gather(*[scrape_category(category) for category in CATEGORIES])
async def scrape_all_categories(start_date: datetime = None):
await asyncio.gather(
*[scrape_category(category, start_date=start_date) for category in CATEGORIES]
)


async def scrape_from_date(start_date: datetime):
await scrape_index(start_date=start_date)
await scrape_all_categories(start_date=start_date)


if __name__ == "__main__":
# asyncio.run(scrape_index())
# asyncio.run(
# scrape_single_page(
# "https://www.channelnewsasia.com/experiences/world-50-best-hotels-2024-4614831"
# )
# )
asyncio.run(scrape_all_categories())
asyncio.run(scrape_from_date(start_date=datetime(2024, 9, 20)))
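The early termination added to `scrape()` relies on the CNA listing API returning rows newest-first: as soon as any row on a page falls before `start_date`, later pages can only be older, so paging stops. A standalone sketch of that filter is below; `filter_page` is a hypothetical helper, and it assumes each row carries the ISO-8601 `date` string and `absolute_url` key used in the diff.

```python
from datetime import datetime


def filter_page(
    rows: list[dict], start_date: datetime | None, seen_urls: set[str]
) -> tuple[list[dict], bool]:
    """Drop rows older than start_date or already scraped; report whether to stop paging."""
    if start_date is not None:
        fresh = [
            row
            for row in rows
            # Compare on the date part only, mirroring split("T")[0] in scrape.py.
            if datetime.fromisoformat(row["date"].split("T")[0]) >= start_date
        ]
    else:
        fresh = list(rows)

    # If anything was dropped, this page crossed the cutoff date, so all
    # subsequent (older) pages can be skipped.
    terminate = len(fresh) != len(rows)

    deduped = [row for row in fresh if row["absolute_url"] not in seen_urls]
    return deduped, terminate


# Example: with a 20 Sep 2024 cutoff, only the newer row survives and paging stops.
page = [
    {"date": "2024-09-21T08:00:00+08:00", "absolute_url": "https://www.channelnewsasia.com/a"},
    {"date": "2024-09-19T08:00:00+08:00", "absolute_url": "https://www.channelnewsasia.com/b"},
]
kept, stop = filter_page(page, datetime(2024, 9, 20), set())
assert [row["absolute_url"] for row in kept] == ["https://www.channelnewsasia.com/a"]
assert stop is True
```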
4 changes: 4 additions & 0 deletions backend/src/scripts/populate.py
@@ -26,3 +26,7 @@ def set_up():
populate()
# store analyses in vector store
store_documents()


if __name__ == "__main__":
set_up()
