feat: add start date for cna scraper
seelengxd committed Sep 26, 2024
1 parent 5ed54ef commit 4034677
Showing 13 changed files with 100 additions and 6,419 deletions.
21 changes: 19 additions & 2 deletions backend/src/cron/fetch_articles.py
@@ -1,12 +1,15 @@
import asyncio
from datetime import datetime, timedelta
from datetime import datetime, timedelta, time
import httpx
import os

from src.common.constants import GUARDIAN_API_KEY
from sqlalchemy import select
from src.events.models import Article, ArticleSource, Event
from src.common.database import engine
from sqlalchemy.orm import Session
from src.scrapers.cna.process import process_all_categories
from src.scrapers.cna.scrape import scrape_from_date
from src.scrapers.guardian.get_articles import get_articles
from src.scrapers.guardian.process import GuardianArticle, GuardianArticleFields

@@ -108,6 +111,20 @@ def populate_daily_articles():
add_daily_articles_to_db(article_obj)


async def populate_daily_articles_cna():
# create the articles folder if it doesn't exist
if "articles" not in os.listdir("./src/scrapers/cna"):
os.mkdir("./src/scrapers/cna/articles")

yesterday = datetime.combine(datetime.now() - timedelta(days=1), time.min)

# this function does not check for duplicates and may clobber the JSON files
await scrape_from_date(start_date=yesterday)
# this function only adds articles from the JSON that are not already in the db,
# so it may salvage a clobbered JSON file
await process_all_categories()


def process_new_articles() -> list[dict]:
with Session(engine) as session:
result = session.scalars(
@@ -136,7 +153,7 @@ def process_new_articles() -> list[dict]:

async def run(limit: int = 30):
# Add new articles to database
# populate_daily_articles()
await populate_daily_articles_cna()
# ADD CNA HERE.
# Process new articles i.e. find articles that we have not generated events for
articles = get_articles(limit)
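For context, `populate_daily_articles_cna()` derives its cutoff with `datetime.combine(datetime.now() - timedelta(days=1), time.min)`, i.e. midnight at the start of yesterday, so the nightly job re-scrapes exactly one full day. A minimal standalone sketch of that wiring, assuming nothing beyond the standard library; the actual `scrape_from_date` / `process_all_categories` calls are stubbed out here and only the cutoff computation mirrors the diff:

```python
import asyncio
from datetime import datetime, timedelta, time


def yesterday_midnight() -> datetime:
    # datetime.combine(<date>, time.min) zeroes out the time component,
    # so the cutoff is 00:00 at the start of the previous day.
    return datetime.combine(datetime.now() - timedelta(days=1), time.min)


async def populate_daily_articles_cna_sketch() -> None:
    start = yesterday_midnight()
    # in the real job this would be scrape_from_date(start_date=start)
    # followed by process_all_categories()
    print(f"would scrape CNA articles published since {start}")


if __name__ == "__main__":
    asyncio.run(populate_daily_articles_cna_sketch())
```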
2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Asia.json
2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Business.json
6,391 changes: 1 addition & 6,390 deletions backend/src/scrapers/cna/data/CNA Explains.json
2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/CNA Insider.json
2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Commentary.json
2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/East Asia.json
2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Singapore.json
2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/Sport.json
2 changes: 1 addition & 1 deletion backend/src/scrapers/cna/data/World.json

Large diffs in these data files are not rendered by default.

18 changes: 18 additions & 0 deletions backend/src/scrapers/cna/process.py
@@ -1,6 +1,7 @@
import argparse
import asyncio
import json
from sqlalchemy import select
from sqlalchemy.orm import Session
from src.events.models import Article, ArticleSource
from src.common.database import engine
@@ -49,6 +50,10 @@ class CNAArticle(BaseModel):
async def process(category: str):
with open(f"./src/scrapers/cna/data/{category}.json") as f:
data = json.load(f)

with Session(engine) as session:
urls = set(session.scalars(select(Article.url)))
count = 0
for page_index, page in enumerate(data):
for index, item in enumerate(page):
try:
@@ -58,8 +63,19 @@ async def process(category: str):
continue
if article.uuid in processed_ids:
continue
if article.absolute_url in urls:
continue
processed_ids.add(article.uuid)

# check the db again in case a duplicate was inserted after the url set was loaded;
# there is still a slight race condition between this check and the commit below
with Session(engine) as session:
article_orm = session.scalar(
select(Article).where(Article.url == article.absolute_url)
)
if article_orm:
continue

# Read body text from scrape.py
with open(
os.path.join(folder_path, f"{article.uuid}_{category}.txt")
@@ -84,10 +100,12 @@ async def process(category: str):
with Session(engine) as session:
session.add(article_orm)
session.commit()
count += 1

except Exception as e:
print(f"{category}: something went wrong with {page_index}, {index}")
print(e)
print(f"Added {count} articles for {category}")


async def process_all_categories():
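The duplicate handling added to `process()` is two-layered: a preloaded set of known URLs filters most repeats cheaply, and a per-article query right before the insert narrows (but does not fully close) the race window noted in the comment. A simplified, self-contained sketch of that pattern, using an illustrative in-memory `Article` model and SQLAlchemy 2.0-style declarations rather than the project's actual `src.events.models.Article` (in the real code the URL set is loaded once per category, not per call):

```python
from sqlalchemy import create_engine, select, String
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session


class Base(DeclarativeBase):
    pass


class Article(Base):
    # illustrative stand-in for src.events.models.Article, not the real model
    __tablename__ = "articles"
    id: Mapped[int] = mapped_column(primary_key=True)
    url: Mapped[str] = mapped_column(String, unique=True)


engine = create_engine("sqlite://")  # throwaway in-memory db for the sketch
Base.metadata.create_all(engine)


def add_if_new(url: str) -> bool:
    # cheap first pass: preload every known url into a set
    with Session(engine) as session:
        known_urls = set(session.scalars(select(Article.url)))
    if url in known_urls:
        return False
    # second check right before the insert; a concurrent writer could still
    # sneak in between this query and the commit (the "slight race condition")
    with Session(engine) as session:
        if session.scalar(select(Article).where(Article.url == url)):
            return False
        session.add(Article(url=url))
        session.commit()
    return True


print(add_if_new("https://example.com/a"))  # True
print(add_if_new("https://example.com/a"))  # False
```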
69 changes: 50 additions & 19 deletions backend/src/scrapers/cna/scrape.py
@@ -1,9 +1,15 @@
from datetime import datetime
from bs4 import BeautifulSoup
from sqlalchemy import select
from sqlalchemy.orm import Session
from src.common.database import engine
import httpx
import random
import json
import asyncio

from src.events.models import Article


def get_url(category_slug: str, page: int):
return f"https://www.channelnewsasia.com/api/v1/infinitelisting/{category_slug}?_format=json&viewMode=infinite_scroll_listing&page={page}"
@@ -22,25 +28,46 @@ def get_url(category_slug: str, page: int):
# Sustainability section does not have a view more button in CNA
}

with Session(engine) as session:
urls = set(session.scalars(select(Article.url)))


async def scrape(category: str, pages: int = 10):
async def scrape(category: str, pages: int = 10, start_date: datetime = None):
slug = CATEGORIES[category]
data = []
with open(f"./src/scrapers/cna/data/{category}.json") as f:
data = json.load(f)
error_count = 0
async with httpx.AsyncClient() as client:
for i in range(pages):
url = get_url(slug, i)
try:
resp = (await client.get(url)).json()
data.append(resp["result"])
result = resp["result"]
terminate = False
if start_date:
result = [
row
for row in result
if datetime.fromisoformat(row["date"].split("T")[0])
>= start_date
]
if len(result) != len(resp["result"]):
terminate = True
# eliminate duplicates
result = [row for row in result if row["absolute_url"] not in urls]
data.append(result)
if terminate:
print("Reached end of date limit", i, category)
break
await asyncio.sleep(3 + (4 * random.random()))
# save every 10 pages in case something bad happens
error_count = 0
if i % 10 == 0:
print(f"{category}: {i}")
with open(f"./src/scrapers/cna/data/{category}.json", "w") as f:
json.dump(data, f)
except: # noqa: E722
except Exception as e:
print(e)
print(
f"Something went wrong for {category}, {i}. Might have ran out of pages."
)
@@ -51,12 +78,14 @@ async def scrape(category: str, pages: int = 10):
print(f"Terminated - {category}")
return

with open(f"./src/scrapers/data/{category}.json", "w") as f:
with open(f"./src/scrapers/cna/data/{category}.json", "w") as f:
json.dump(data, f)


async def scrape_index():
await asyncio.gather(*[scrape(category, 200) for category in CATEGORIES])
async def scrape_index(start_date: datetime = None):
await asyncio.gather(
*[scrape(category, 200, start_date) for category in CATEGORIES]
)


async def scrape_single_page(url):
@@ -75,17 +104,18 @@ async def scrape_single_page(url):
scraped_slugs = set()


async def scrape_category(category: str):
async def scrape_category(category: str, start_date: datetime = None):
with open(f"./src/scrapers/cna/data/{category}.json") as f:
data = json.load(f)
skipped = 0

for index, page in enumerate(data):
if category == "Asia" and index <= 58:
continue
for item in page:
if item["type"] != "article":
continue
date = item["date"]
if start_date and datetime.fromisoformat(date.split("T")[0]) < start_date:
continue
try:
absolute_url = item["absolute_url"]
if absolute_url in scraped_slugs:
@@ -103,15 +133,16 @@ async def scrape_category(category: str):
print(f"scraped: {category}, {index}(x10)")


async def scrape_all_categories():
asyncio.gather(*[scrape_category(category) for category in CATEGORIES])
async def scrape_all_categories(start_date: datetime = None):
await asyncio.gather(
*[scrape_category(category, start_date=start_date) for category in CATEGORIES]
)


async def scrape_from_date(start_date: datetime):
await scrape_index(start_date=start_date)
await scrape_all_categories(start_date=start_date)


if __name__ == "__main__":
# asyncio.run(scrape_index())
# asyncio.run(
# scrape_single_page(
# "https://www.channelnewsasia.com/experiences/world-50-best-hotels-2024-4614831"
# )
# )
asyncio.run(scrape_all_categories())
asyncio.run(scrape_from_date(start_date=datetime(2024, 9, 20)))
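The `start_date` handling in `scrape()` keeps only listing rows dated on or after the cutoff and stops paging as soon as a page contains anything older, which relies on the CNA listing API returning items newest-first (an assumption carried over from the diff's early-break logic). A self-contained sketch of that filter and early-exit check, with hypothetical sample rows:

```python
from datetime import datetime


def filter_page(rows: list[dict], start_date: datetime) -> tuple[list[dict], bool]:
    # keep only rows published on/after the cutoff (dates look like
    # "2024-09-25T08:00:00+08:00"; only the date part is compared)
    kept = [
        row
        for row in rows
        if datetime.fromisoformat(row["date"].split("T")[0]) >= start_date
    ]
    # if anything was dropped, this page already reaches past the cutoff,
    # so pagination can stop (assumes newest-first ordering)
    return kept, len(kept) != len(rows)


# hypothetical sample rows, not real CNA data
page = [
    {"date": "2024-09-25T08:00:00+08:00", "absolute_url": "https://example.com/new"},
    {"date": "2024-09-10T08:00:00+08:00", "absolute_url": "https://example.com/old"},
]
kept, stop = filter_page(page, datetime(2024, 9, 20))
print([row["absolute_url"] for row in kept], stop)  # ['https://example.com/new'] True
```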
4 changes: 4 additions & 0 deletions backend/src/scripts/populate.py
@@ -26,3 +26,7 @@ def set_up():
populate()
# store analyses in vector store
store_documents()


if __name__ == "__main__":
set_up()
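With the new `__main__` guard, the setup script can presumably be run directly, for example with something like `python -m src.scripts.populate` from the backend directory (assuming that is the package root), instead of only being imported and called from elsewhere.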
