From 1c46a3e10f3f485919ebbdd586329449ca68325a Mon Sep 17 00:00:00 2001 From: seeleng Date: Sat, 21 Sep 2024 16:47:32 +0800 Subject: [PATCH 1/5] feat: add event and article tables --- ...e6552f0_create_article_and_event_tables.py | 64 +++++++++++++++++++ backend/src/events/models.py | 56 ++++++++++++++++ backend/src/utils/models.py | 3 +- 3 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py create mode 100644 backend/src/events/models.py diff --git a/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py b/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py new file mode 100644 index 00000000..4f215f03 --- /dev/null +++ b/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py @@ -0,0 +1,64 @@ +"""Create article and event tables + +Revision ID: 423a1e6552f0 +Revises: 681607ac9aeb +Create Date: 2024-09-21 16:44:37.663284 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = '423a1e6552f0' +down_revision: Union[str, None] = '681607ac9aeb' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + sa.Enum('CNA', 'GUARDIAN', name='articlesource').create(op.get_bind()) + op.create_table('article', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('title', sa.String(), nullable=False), + sa.Column('summary', sa.String(), nullable=False), + sa.Column('body', sa.String(), nullable=False), + sa.Column('url', sa.String(), nullable=False), + sa.Column('source', postgresql.ENUM('CNA', 'GUARDIAN', name='articlesource', create_type=False), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('category', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('name', sa.String(), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('event', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('title', sa.String(), nullable=False), + sa.Column('description', sa.String(), nullable=False), + sa.Column('analysis', sa.String(), nullable=False), + sa.Column('duplicate', sa.Boolean(), nullable=False), + sa.Column('date', sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('event_category', + sa.Column('event_id', sa.Integer(), nullable=False), + sa.Column('category_id', sa.Integer(), nullable=False), + sa.ForeignKeyConstraint(['category_id'], ['category.id'], ), + sa.ForeignKeyConstraint(['event_id'], ['event.id'], ), + sa.PrimaryKeyConstraint('event_id', 'category_id') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('event_category') + op.drop_table('event') + op.drop_table('category') + op.drop_table('article') + sa.Enum('CNA', 'GUARDIAN', name='articlesource').drop(op.get_bind()) + # ### end Alembic commands ### diff --git a/backend/src/events/models.py b/backend/src/events/models.py new file mode 100644 index 00000000..8b531328 --- /dev/null +++ b/backend/src/events/models.py @@ -0,0 +1,56 @@ +from enum import Enum +from sqlalchemy.orm import Mapped, mapped_column, relationship +from sqlalchemy import ForeignKey +from datetime import datetime +from src.common.base import Base + + +class ArticleSource(str, Enum): + CNA = "CNA" + GUARDIAN = "GUARDIAN" + + +class Article(Base): + __tablename__ = "article" + + id: Mapped[int] = mapped_column(primary_key=True) + title: Mapped[str] + summary: Mapped[str] + body: Mapped[str] + url: Mapped[str] + source: Mapped[ArticleSource] + + +class Event(Base): + __tablename__ = "event" + + id: Mapped[int] = mapped_column(primary_key=True) + title: Mapped[str] + description: Mapped[str] + analysis: Mapped[str] + duplicate: Mapped[bool] + date: Mapped[datetime] + + categories: Mapped[list["Category"]] = relationship( + back_populates="events", secondary="event_category" + ) + + +class Category(Base): + __tablename__ = "category" + + id: Mapped[int] = mapped_column(primary_key=True) + name: Mapped[str] + + events: Mapped[list[Event]] = relationship( + secondary="event_category", back_populates="categories" + ) + + +class EventCategory(Base): + __tablename__ = "event_category" + + event_id: Mapped[int] = mapped_column(ForeignKey("event.id"), primary_key=True) + category_id: Mapped[int] = mapped_column( + ForeignKey("category.id"), primary_key=True + ) diff --git a/backend/src/utils/models.py b/backend/src/utils/models.py index cf363d23..41503e91 100644 --- a/backend/src/utils/models.py +++ b/backend/src/utils/models.py @@ -1,3 +1,4 @@ # this is just to load models in alembic -from src.auth import models # noqa: F401 +from 
src.auth import models as auth_models # noqa: F401 +from src.events import models as event_models # noqa: F401 From 5e9fa3947a12221c27628ed1ce234bfa427cfb54 Mon Sep 17 00:00:00 2001 From: seeleng Date: Sat, 21 Sep 2024 18:26:38 +0800 Subject: [PATCH 2/5] fix: add date column to article --- ...e84e13568072_add_columns_to_event_table.py | 34 +++++++++++++++++++ .../f3e847c3ee9d_add_article_date_column.py | 30 ++++++++++++++++ backend/src/events/models.py | 7 ++++ 3 files changed, 71 insertions(+) create mode 100644 backend/alembic/versions/e84e13568072_add_columns_to_event_table.py create mode 100644 backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py diff --git a/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py b/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py new file mode 100644 index 00000000..97cdae7d --- /dev/null +++ b/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py @@ -0,0 +1,34 @@ +"""Add columns to event table + +Revision ID: e84e13568072 +Revises: 423a1e6552f0 +Create Date: 2024-09-21 17:21:24.515814 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'e84e13568072' +down_revision: Union[str, None] = '423a1e6552f0' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('event', sa.Column('is_singapore', sa.Boolean(), nullable=False)) + op.add_column('event', sa.Column('original_article_id', sa.Integer(), nullable=False)) + op.create_foreign_key(None, 'event', 'article', ['original_article_id'], ['id']) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_constraint(None, 'event', type_='foreignkey') + op.drop_column('event', 'original_article_id') + op.drop_column('event', 'is_singapore') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py b/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py new file mode 100644 index 00000000..fbbde5b5 --- /dev/null +++ b/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py @@ -0,0 +1,30 @@ +"""Add article date column + +Revision ID: f3e847c3ee9d +Revises: e84e13568072 +Create Date: 2024-09-21 18:25:05.912853 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'f3e847c3ee9d' +down_revision: Union[str, None] = 'e84e13568072' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('article', sa.Column('date', sa.DateTime(), nullable=False)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('article', 'date') + # ### end Alembic commands ### diff --git a/backend/src/events/models.py b/backend/src/events/models.py index 8b531328..0e00ac6b 100644 --- a/backend/src/events/models.py +++ b/backend/src/events/models.py @@ -19,6 +19,9 @@ class Article(Base): body: Mapped[str] url: Mapped[str] source: Mapped[ArticleSource] + date: Mapped[datetime] + + event: Mapped["Event"] = relationship(back_populates="original_article") class Event(Base): @@ -30,11 +33,15 @@ class Event(Base): analysis: Mapped[str] duplicate: Mapped[bool] date: Mapped[datetime] + is_singapore: Mapped[bool] + original_article_id: Mapped[int] = mapped_column(ForeignKey("article.id")) categories: Mapped[list["Category"]] = relationship( back_populates="events", secondary="event_category" ) + original_article: Mapped[Article] = relationship(back_populates="event") + class Category(Base): __tablename__ = "category" From 1c5e3865516a6b3d02a69e6bf5ced18f68263530 Mon Sep 17 00:00:00 2001 From: seeleng Date: Sat, 21 Sep 2024 18:26:59 +0800 Subject: [PATCH 3/5] feat: add process script for cna --- backend/src/scrapers/cna/process.py | 96 +++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 backend/src/scrapers/cna/process.py diff --git a/backend/src/scrapers/cna/process.py b/backend/src/scrapers/cna/process.py new file mode 100644 index 00000000..029d6c4f --- /dev/null +++ b/backend/src/scrapers/cna/process.py @@ -0,0 +1,96 @@ +import argparse +import asyncio +import json +from sqlalchemy.orm import Session +from src.events.models import Article, ArticleSource +from src.common.database import engine +from pydantic import BaseModel, ConfigDict +from bs4 import BeautifulSoup +import os + + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input", help="input folder path") + +args = parser.parse_args() +folder_path = args.input + + +processed_ids = set() + +CATEGORIES = [ + "Asia", + "East Asia", + "Singapore", + "World", + "Commentary", 
+    "CNA Explains",
+    "Business",
+    "Sport",
+    "CNA Insider",
+]
+
+
+class CNAArticle(BaseModel):  # one item of a scraped CNA listing page
+    model_config = ConfigDict(extra="allow")  # keep fields not declared below
+
+    type: str
+    uuid: str
+    nid: str
+    title: str
+    title_url: str
+    absolute_url: str
+    field_summary: str | None = None
+    description: str | None = None
+    release_date: str
+
+
+async def process(category: str):  # store one category's scraped articles as Article rows
+    with open(f"./src/scrapers/cna/data/{category}.json") as f:
+        data = json.load(f)
+    for page_index, page in enumerate(data):
+        for index, item in enumerate(page):
+            try:
+                article = CNAArticle.model_validate(item)
+
+                if article.type != "article":
+                    continue
+                if article.uuid in processed_ids:  # uuid already handled in this run
+                    continue
+                processed_ids.add(article.uuid)
+
+                # Read body text from scrape.py
+                with open(
+                    os.path.join(folder_path, f"{article.uuid}_{category}.txt")
+                ) as body_file:
+                    body = body_file.read()
+
+                if not body.strip():
+                    continue
+
+                # Add to database
+                article_orm = Article(
+                    title=article.title,
+                    summary=BeautifulSoup(article.description, "html.parser").get_text()
+                    if article.description
+                    else "",
+                    url=article.absolute_url,
+                    source=ArticleSource.CNA,
+                    body=body.strip(),
+                    date=article.release_date,
+                )
+                with Session(engine) as session:
+                    session.add(article_orm)
+                    session.commit()
+
+            except Exception as e:  # best effort: report the failing item and carry on
+                print(f"{category}: something went wrong with {page_index}, {index}")
+                print(e)
+
+
+async def process_all_categories():  # run process() for every entry in CATEGORIES
+    await asyncio.gather(*[process(category) for category in CATEGORIES])  # must be awaited
+
+
+if __name__ == "__main__":
+    asyncio.run(process_all_categories())
From 15646dc7e4f182666856114480dc3d5d5220cab0 Mon Sep 17 00:00:00 2001
From: seeleng
Date: Sat, 21 Sep 2024 19:16:58 +0800
Subject: [PATCH 4/5] feat: add guardian process script

---
 backend/src/scrapers/guardian/process.py | 46 ++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 backend/src/scrapers/guardian/process.py

diff --git a/backend/src/scrapers/guardian/process.py b/backend/src/scrapers/guardian/process.py
new file mode 100644
index 
00000000..367af7ea --- /dev/null +++ b/backend/src/scrapers/guardian/process.py @@ -0,0 +1,46 @@ +import argparse + +import json +from sqlalchemy.orm import Session +from src.events.models import Article, ArticleSource +from src.common.database import engine +from pydantic import BaseModel, ConfigDict +from bs4 import BeautifulSoup +import os +from pprint import pprint + + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input", help="input folder path") +args = parser.parse_args() + + +class GuardianArticleFields(BaseModel): + model_config = ConfigDict(extra="allow") + bodyText: str + trailText: str | None = None + + +class GuardianArticle(BaseModel): + model_config = ConfigDict(extra="allow") + fields: GuardianArticleFields + webUrl: str + webTitle: str + webPublicationDate: str + + +with open(args.input) as f: + data = json.load(f) + for row in data: + article = GuardianArticle.model_validate(row) + article_orm = Article( + title=article.webTitle, + summary=article.fields.trailText if article.fields.trailText else "", + url=article.webUrl, + source=ArticleSource.GUARDIAN, + body=article.fields.bodyText, + date=article.webPublicationDate, + ) + with Session(engine) as session: + session.add(article_orm) + session.commit() From d5c3116a86c2aba903641d4c0f859d6cfa67988f Mon Sep 17 00:00:00 2001 From: seeleng Date: Sat, 21 Sep 2024 19:18:35 +0800 Subject: [PATCH 5/5] style: ruff --- ...e6552f0_create_article_and_event_tables.py | 83 +++++++++++-------- ...e84e13568072_add_columns_to_event_table.py | 19 +++-- .../f3e847c3ee9d_add_article_date_column.py | 9 +- backend/src/scrapers/guardian/process.py | 3 - 4 files changed, 65 insertions(+), 49 deletions(-) diff --git a/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py b/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py index 4f215f03..43ca53d6 100644 --- a/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py +++ 
b/backend/alembic/versions/423a1e6552f0_create_article_and_event_tables.py @@ -5,6 +5,7 @@ Create Date: 2024-09-21 16:44:37.663284 """ + from typing import Sequence, Union from alembic import op @@ -12,53 +13,67 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision: str = '423a1e6552f0' -down_revision: Union[str, None] = '681607ac9aeb' +revision: str = "423a1e6552f0" +down_revision: Union[str, None] = "681607ac9aeb" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - sa.Enum('CNA', 'GUARDIAN', name='articlesource').create(op.get_bind()) - op.create_table('article', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('title', sa.String(), nullable=False), - sa.Column('summary', sa.String(), nullable=False), - sa.Column('body', sa.String(), nullable=False), - sa.Column('url', sa.String(), nullable=False), - sa.Column('source', postgresql.ENUM('CNA', 'GUARDIAN', name='articlesource', create_type=False), nullable=False), - sa.PrimaryKeyConstraint('id') + sa.Enum("CNA", "GUARDIAN", name="articlesource").create(op.get_bind()) + op.create_table( + "article", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("summary", sa.String(), nullable=False), + sa.Column("body", sa.String(), nullable=False), + sa.Column("url", sa.String(), nullable=False), + sa.Column( + "source", + postgresql.ENUM("CNA", "GUARDIAN", name="articlesource", create_type=False), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), ) - op.create_table('category', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('name', sa.String(), nullable=False), - sa.PrimaryKeyConstraint('id') + op.create_table( + "category", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("name", sa.String(), nullable=False), + sa.PrimaryKeyConstraint("id"), 
) - op.create_table('event', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('title', sa.String(), nullable=False), - sa.Column('description', sa.String(), nullable=False), - sa.Column('analysis', sa.String(), nullable=False), - sa.Column('duplicate', sa.Boolean(), nullable=False), - sa.Column('date', sa.DateTime(), nullable=False), - sa.PrimaryKeyConstraint('id') + op.create_table( + "event", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.Column("description", sa.String(), nullable=False), + sa.Column("analysis", sa.String(), nullable=False), + sa.Column("duplicate", sa.Boolean(), nullable=False), + sa.Column("date", sa.DateTime(), nullable=False), + sa.PrimaryKeyConstraint("id"), ) - op.create_table('event_category', - sa.Column('event_id', sa.Integer(), nullable=False), - sa.Column('category_id', sa.Integer(), nullable=False), - sa.ForeignKeyConstraint(['category_id'], ['category.id'], ), - sa.ForeignKeyConstraint(['event_id'], ['event.id'], ), - sa.PrimaryKeyConstraint('event_id', 'category_id') + op.create_table( + "event_category", + sa.Column("event_id", sa.Integer(), nullable=False), + sa.Column("category_id", sa.Integer(), nullable=False), + sa.ForeignKeyConstraint( + ["category_id"], + ["category.id"], + ), + sa.ForeignKeyConstraint( + ["event_id"], + ["event.id"], + ), + sa.PrimaryKeyConstraint("event_id", "category_id"), ) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### - op.drop_table('event_category') - op.drop_table('event') - op.drop_table('category') - op.drop_table('article') - sa.Enum('CNA', 'GUARDIAN', name='articlesource').drop(op.get_bind()) + op.drop_table("event_category") + op.drop_table("event") + op.drop_table("category") + op.drop_table("article") + sa.Enum("CNA", "GUARDIAN", name="articlesource").drop(op.get_bind()) # ### end Alembic commands ### diff --git a/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py b/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py index 97cdae7d..bb4a052f 100644 --- a/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py +++ b/backend/alembic/versions/e84e13568072_add_columns_to_event_table.py @@ -5,6 +5,7 @@ Create Date: 2024-09-21 17:21:24.515814 """ + from typing import Sequence, Union from alembic import op @@ -12,23 +13,25 @@ # revision identifiers, used by Alembic. -revision: str = 'e84e13568072' -down_revision: Union[str, None] = '423a1e6552f0' +revision: str = "e84e13568072" +down_revision: Union[str, None] = "423a1e6552f0" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - op.add_column('event', sa.Column('is_singapore', sa.Boolean(), nullable=False)) - op.add_column('event', sa.Column('original_article_id', sa.Integer(), nullable=False)) - op.create_foreign_key(None, 'event', 'article', ['original_article_id'], ['id']) + op.add_column("event", sa.Column("is_singapore", sa.Boolean(), nullable=False)) + op.add_column( + "event", sa.Column("original_article_id", sa.Integer(), nullable=False) + ) + op.create_foreign_key(None, "event", "article", ["original_article_id"], ["id"]) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### - op.drop_constraint(None, 'event', type_='foreignkey') - op.drop_column('event', 'original_article_id') - op.drop_column('event', 'is_singapore') + op.drop_constraint(None, "event", type_="foreignkey") + op.drop_column("event", "original_article_id") + op.drop_column("event", "is_singapore") # ### end Alembic commands ### diff --git a/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py b/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py index fbbde5b5..21c387af 100644 --- a/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py +++ b/backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py @@ -5,6 +5,7 @@ Create Date: 2024-09-21 18:25:05.912853 """ + from typing import Sequence, Union from alembic import op @@ -12,19 +13,19 @@ # revision identifiers, used by Alembic. -revision: str = 'f3e847c3ee9d' -down_revision: Union[str, None] = 'e84e13568072' +revision: str = "f3e847c3ee9d" +down_revision: Union[str, None] = "e84e13568072" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - op.add_column('article', sa.Column('date', sa.DateTime(), nullable=False)) + op.add_column("article", sa.Column("date", sa.DateTime(), nullable=False)) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### - op.drop_column('article', 'date') + op.drop_column("article", "date") # ### end Alembic commands ### diff --git a/backend/src/scrapers/guardian/process.py b/backend/src/scrapers/guardian/process.py index 367af7ea..9ef73856 100644 --- a/backend/src/scrapers/guardian/process.py +++ b/backend/src/scrapers/guardian/process.py @@ -5,9 +5,6 @@ from src.events.models import Article, ArticleSource from src.common.database import engine from pydantic import BaseModel, ConfigDict -from bs4 import BeautifulSoup -import os -from pprint import pprint parser = argparse.ArgumentParser()