diff --git a/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py b/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py
new file mode 100644
index 00000000..43ca53d6
--- /dev/null
+++ b/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py
@@ -0,0 +1,79 @@
+"""Create article and event tables
+
+Revision ID: 423a1e6552f0
+Revises: 681607ac9aeb
+Create Date: 2024-09-21 16:44:37.663284
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision: str = "423a1e6552f0"
+down_revision: Union[str, None] = "681607ac9aeb"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    sa.Enum("CNA", "GUARDIAN", name="articlesource").create(op.get_bind())
+    op.create_table(
+        "article",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("title", sa.String(), nullable=False),
+        sa.Column("summary", sa.String(), nullable=False),
+        sa.Column("body", sa.String(), nullable=False),
+        sa.Column("url", sa.String(), nullable=False),
+        sa.Column(
+            "source",
+            postgresql.ENUM("CNA", "GUARDIAN", name="articlesource", create_type=False),
+            nullable=False,
+        ),
+        sa.PrimaryKeyConstraint("id"),
+    )
+    op.create_table(
+        "category",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("name", sa.String(), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+    )
+    op.create_table(
+        "event",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("title", sa.String(), nullable=False),
+        sa.Column("description", sa.String(), nullable=False),
+        sa.Column("analysis", sa.String(), nullable=False),
+        sa.Column("duplicate", sa.Boolean(), nullable=False),
+        sa.Column("date", sa.DateTime(), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+    )
+    op.create_table(
+        "event_category",
+        sa.Column("event_id", sa.Integer(), nullable=False),
+        sa.Column("category_id", sa.Integer(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["category_id"],
+            ["category.id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["event_id"],
+            ["event.id"],
+        ),
+        sa.PrimaryKeyConstraint("event_id", "category_id"),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("event_category")
+    op.drop_table("event")
+    op.drop_table("category")
+    op.drop_table("article")
+    sa.Enum("CNA", "GUARDIAN", name="articlesource").drop(op.get_bind())
+    # ### end Alembic commands ###
diff --git a/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py b/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py
new file mode 100644
index 00000000..bb4a052f
--- /dev/null
+++ b/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py
@@ -0,0 +1,46 @@
+"""Add columns to event table
+
+Revision ID: e84e13568072
+Revises: 423a1e6552f0
+Create Date: 2024-09-21 17:21:24.515814
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = "e84e13568072"
+down_revision: Union[str, None] = "423a1e6552f0"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column("event", sa.Column("is_singapore", sa.Boolean(), nullable=False))
+    op.add_column(
+        "event", sa.Column("original_article_id", sa.Integer(), nullable=False)
+    )
+    # Name the foreign key explicitly so downgrade() can drop it. This is the
+    # name PostgreSQL generates by default, so existing databases still match.
+    op.create_foreign_key(
+        "event_original_article_id_fkey",
+        "event",
+        "article",
+        ["original_article_id"],
+        ["id"],
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    # A constraint can only be dropped by name; None cannot be rendered here.
+    op.drop_constraint("event_original_article_id_fkey", "event", type_="foreignkey")
+    op.drop_column("event", "original_article_id")
+    op.drop_column("event", "is_singapore")
+    # ### end Alembic commands ###
diff --git a/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py b/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py
new file mode 100644
index 00000000..21c387af
--- /dev/null
+++ b/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py
@@ -0,0 +1,31 @@
+"""Add article date column
+
+Revision ID: f3e847c3ee9d
+Revises: e84e13568072
+Create Date: 2024-09-21 18:25:05.912853
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = "f3e847c3ee9d"
+down_revision: Union[str, None] = "e84e13568072"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column("article", sa.Column("date", sa.DateTime(), nullable=False))
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column("article", "date")
+    # ### end Alembic commands ###
diff --git a/backend/src/events/models.py b/backend/src/events/models.py
new file mode 100644
index 00000000..0e00ac6b
--- /dev/null
+++ b/backend/src/events/models.py
@@ -0,0 +1,63 @@
+from enum import Enum
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+from sqlalchemy import ForeignKey
+from datetime import datetime
+from src.common.base import Base
+
+
+class ArticleSource(str, Enum):
+    CNA = "CNA"
+    GUARDIAN = "GUARDIAN"
+
+
+class Article(Base):
+    __tablename__ = "article"
+
+    id: Mapped[int] = mapped_column(primary_key=True)
+    title: Mapped[str]
+    summary: Mapped[str]
+    body: Mapped[str]
+    url: Mapped[str]
+    source: Mapped[ArticleSource]
+    date: Mapped[datetime]
+
+    event: Mapped["Event"] = relationship(back_populates="original_article")
+
+
+class Event(Base):
+    __tablename__ = "event"
+
+    id: Mapped[int] = mapped_column(primary_key=True)
+    title: Mapped[str]
+    description: Mapped[str]
+    analysis: Mapped[str]
+    duplicate: Mapped[bool]
+    date: Mapped[datetime]
+    is_singapore: Mapped[bool]
+    original_article_id: Mapped[int] = mapped_column(ForeignKey("article.id"))
+
+    categories: Mapped[list["Category"]] = relationship(
+        back_populates="events", secondary="event_category"
+    )
+
+    original_article: Mapped[Article] = relationship(back_populates="event")
+
+
+class Category(Base):
+    __tablename__ = "category"
+
+    id: Mapped[int] = mapped_column(primary_key=True)
+    name: Mapped[str]
+
+    events: Mapped[list[Event]] = relationship(
+        secondary="event_category", back_populates="categories"
+    )
+
+
+class EventCategory(Base):
+    __tablename__ = "event_category"
+
+    event_id: Mapped[int] = mapped_column(ForeignKey("event.id"), primary_key=True)
+    category_id: Mapped[int] = mapped_column(
+        ForeignKey("category.id"), primary_key=True
+    )
diff --git a/backend/src/scrapers/cna/process.py b/backend/src/scrapers/cna/process.py
new file mode 100644
index 00000000..029d6c4f
--- /dev/null
+++ b/backend/src/scrapers/cna/process.py
@@ -0,0 +1,106 @@
+import argparse
+import asyncio
+import json
+from sqlalchemy.orm import Session
+from src.events.models import Article, ArticleSource
+from src.common.database import engine
+from pydantic import BaseModel, ConfigDict
+from bs4 import BeautifulSoup
+import os
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-i", "--input", help="input folder path")
+
+args = parser.parse_args()
+folder_path = args.input
+
+
+processed_ids = set()
+
+CATEGORIES = [
+    "Asia",
+    "East Asia",
+    "Singapore",
+    "World",
+    "Commentary",
+    "CNA Explains",
+    "Business",
+    "Sport",
+    "CNA Insider",
+]
+
+
+class CNAArticle(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    type: str
+    uuid: str
+    nid: str
+    title: str
+    title_url: str
+    absolute_url: str
+    field_summary: str | None = None
+    description: str | None = None
+    release_date: str
+
+
+async def process(category: str):
+    """Load the scraped CNA articles of one category into the database.
+
+    Items that are not articles, articles whose uuid was already processed,
+    and articles whose body file is blank are skipped. An error on one item
+    is printed and the remaining items are still processed.
+    """
+    with open(f"./src/scrapers/cna/data/{category}.json") as f:
+        data = json.load(f)
+    for page_index, page in enumerate(data):
+        for index, item in enumerate(page):
+            try:
+                article = CNAArticle.model_validate(item)
+
+                if article.type != "article":
+                    continue
+                if article.uuid in processed_ids:
+                    continue
+                processed_ids.add(article.uuid)
+
+                # Read body text from scrape.py
+                with open(
+                    os.path.join(folder_path, f"{article.uuid}_{category}.txt")
+                ) as f:
+                    body = f.read()
+
+                if not body.strip():
+                    continue
+
+                # Add to database
+                article_orm = Article(
+                    title=article.title,
+                    # Name the parser explicitly instead of letting bs4 guess.
+                    summary=BeautifulSoup(article.description, "html.parser").getText()
+                    if article.description
+                    else "",
+                    url=article.absolute_url,
+                    source=ArticleSource.CNA,
+                    body=body.strip(),
+                    date=article.release_date,
+                )
+                with Session(engine) as session:
+                    session.add(article_orm)
+                    session.commit()
+
+            except Exception as e:
+                print(f"{category}: something went wrong with {page_index}, {index}")
+                print(e)
+
+
+async def process_all_categories():
+    """Process every category in CATEGORIES and wait for all of them."""
+    # gather() only returns an awaitable; without the await the coroutine
+    # would return before the per-category work is known to have finished.
+    await asyncio.gather(*[process(category) for category in CATEGORIES])
+
+
+if __name__ == "__main__":
+    asyncio.run(process_all_categories())
diff --git a/backend/src/scrapers/guardian/process.py b/backend/src/scrapers/guardian/process.py
new file mode 100644
index 00000000..9ef73856
--- /dev/null
+++ b/backend/src/scrapers/guardian/process.py
@@ -0,0 +1,43 @@
+import argparse
+
+import json
+from sqlalchemy.orm import Session
+from src.events.models import Article, ArticleSource
+from src.common.database import engine
+from pydantic import BaseModel, ConfigDict
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-i", "--input", help="input JSON file path")
+args = parser.parse_args()
+
+
+class GuardianArticleFields(BaseModel):
+    model_config = ConfigDict(extra="allow")
+    bodyText: str
+    trailText: str | None = None
+
+
+class GuardianArticle(BaseModel):
+    model_config = ConfigDict(extra="allow")
+    fields: GuardianArticleFields
+    webUrl: str
+    webTitle: str
+    webPublicationDate: str
+
+
+with open(args.input) as f:
+    data = json.load(f)
+    for row in data:
+        article = GuardianArticle.model_validate(row)
+        article_orm = Article(
+            title=article.webTitle,
+            summary=article.fields.trailText if article.fields.trailText else "",
+            url=article.webUrl,
+            source=ArticleSource.GUARDIAN,
+            body=article.fields.bodyText,
+            date=article.webPublicationDate,
+        )
+        with Session(engine) as session:
+            session.add(article_orm)
+            session.commit()
diff --git a/backend/src/utils/models.py b/backend/src/utils/models.py
index cf363d23..41503e91 100644
--- a/backend/src/utils/models.py
+++ b/backend/src/utils/models.py
@@ -1,3 +1,4 @@
 # this is just to load models in alembic
 
-from src.auth import models  # noqa: F401
+from src.auth import models as auth_models  # noqa: F401
+from src.events import models as event_models  # noqa: F401