Skip to content

Commit

Permalink
Merge pull request #17 from cs3216-a3-group-4/seeleng/article-preproc…
Browse files Browse the repository at this point in the history
…essing

feat: add event preprocessing
  • Loading branch information
chloeelim authored Sep 21, 2024
2 parents 0a409fe + d5c3116 commit 74092ed
Show file tree
Hide file tree
Showing 7 changed files with 351 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Create article and event tables
Revision ID: 423a1e6552f0
Revises: 681607ac9aeb
Create Date: 2024-09-21 16:44:37.663284
"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration this one revises (see "Revises" in the module docstring).
revision: str = "423a1e6552f0"
down_revision: Union[str, None] = "681607ac9aeb"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the ``articlesource`` enum and the article/category/event tables.

    Tables are created parents first: ``event_category`` comes last because
    it holds foreign keys to both ``event`` and ``category``.
    """
    # The enum type is created explicitly up front, so the ``source`` column
    # below references it with create_type=False rather than creating it again.
    article_source_values = ("CNA", "GUARDIAN")
    sa.Enum(*article_source_values, name="articlesource").create(op.get_bind())

    op.create_table(
        "article",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("title", sa.String(), nullable=False),
        sa.Column("summary", sa.String(), nullable=False),
        sa.Column("body", sa.String(), nullable=False),
        sa.Column("url", sa.String(), nullable=False),
        sa.Column(
            "source",
            postgresql.ENUM(
                *article_source_values, name="articlesource", create_type=False
            ),
            nullable=False,
        ),
        sa.PrimaryKeyConstraint("id"),
    )

    op.create_table(
        "category",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("name", sa.String(), nullable=False),
        sa.PrimaryKeyConstraint("id"),
    )

    op.create_table(
        "event",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("title", sa.String(), nullable=False),
        sa.Column("description", sa.String(), nullable=False),
        sa.Column("analysis", sa.String(), nullable=False),
        sa.Column("duplicate", sa.Boolean(), nullable=False),
        sa.Column("date", sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint("id"),
    )

    # Association table linking events to categories (composite primary key).
    op.create_table(
        "event_category",
        sa.Column("event_id", sa.Integer(), nullable=False),
        sa.Column("category_id", sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(["category_id"], ["category.id"]),
        sa.ForeignKeyConstraint(["event_id"], ["event.id"]),
        sa.PrimaryKeyConstraint("event_id", "category_id"),
    )


def downgrade() -> None:
    """Drop the tables created by ``upgrade`` and then the ``articlesource`` enum."""
    # Children first: event_category references event and category, so it
    # has to go before the tables it points at.
    for table_name in ("event_category", "event", "category", "article"):
        op.drop_table(table_name)

    # The enum can only be dropped once the article table no longer uses it.
    article_source = sa.Enum("CNA", "GUARDIAN", name="articlesource")
    article_source.drop(op.get_bind())
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Add columns to event table
Revision ID: e84e13568072
Revises: 423a1e6552f0
Create Date: 2024-09-21 17:21:24.515814
"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration this one revises (see "Revises" in the module docstring).
revision: str = "e84e13568072"
down_revision: Union[str, None] = "423a1e6552f0"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add ``is_singapore`` and ``original_article_id`` to the ``event`` table.

    ``original_article_id`` becomes a foreign key to ``article.id``.
    """
    # NOTE(review): both columns are NOT NULL without a server default, so
    # this will fail if ``event`` already contains rows — confirm the table
    # is empty wherever this migration runs.
    op.add_column("event", sa.Column("is_singapore", sa.Boolean(), nullable=False))
    op.add_column(
        "event", sa.Column("original_article_id", sa.Integer(), nullable=False)
    )
    # Name the constraint explicitly so that downgrade() can refer to it.
    # The name matches what PostgreSQL generates for an unnamed foreign key
    # on this column, so databases migrated before this change are
    # unaffected (assumes no custom naming convention — TODO confirm).
    op.create_foreign_key(
        "event_original_article_id_fkey",
        "event",
        "article",
        ["original_article_id"],
        ["id"],
    )


def downgrade() -> None:
    """Remove the foreign key and the two columns added by ``upgrade``."""
    # A constraint name is required here; passing None cannot identify the
    # constraint to drop. "event_original_article_id_fkey" is the name
    # PostgreSQL assigns by default to the unnamed foreign key on
    # event.original_article_id (assumes no custom naming convention is
    # configured on the metadata — TODO confirm).
    op.drop_constraint(
        "event_original_article_id_fkey", "event", type_="foreignkey"
    )
    op.drop_column("event", "original_article_id")
    op.drop_column("event", "is_singapore")
31 changes: 31 additions & 0 deletions backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Add article date column
Revision ID: f3e847c3ee9d
Revises: e84e13568072
Create Date: 2024-09-21 18:25:05.912853
"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration this one revises (see "Revises" in the module docstring).
revision: str = "f3e847c3ee9d"
down_revision: Union[str, None] = "e84e13568072"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add a non-nullable ``date`` column to the ``article`` table."""
    # NOTE(review): NOT NULL without a server default — this will fail if
    # ``article`` already contains rows; confirm the table is empty.
    date_column = sa.Column("date", sa.DateTime(), nullable=False)
    op.add_column("article", date_column)


def downgrade() -> None:
    """Drop the ``date`` column from the ``article`` table again."""
    table_name, column_name = "article", "date"
    op.drop_column(table_name, column_name)
63 changes: 63 additions & 0 deletions backend/src/events/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from enum import Enum
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlalchemy import ForeignKey
from datetime import datetime
from src.common.base import Base


class ArticleSource(str, Enum):
    """Sources an article can come from.

    Members are also ``str`` instances, so each compares equal to its value.
    """

    CNA = "CNA"
    GUARDIAN = "GUARDIAN"


class Article(Base):
    """ORM model mapped to the ``article`` table."""

    __tablename__ = "article"

    id: Mapped[int] = mapped_column(primary_key=True)

    # Textual content of the article.
    title: Mapped[str]
    summary: Mapped[str]
    body: Mapped[str]

    # Provenance of the article.
    url: Mapped[str]
    source: Mapped[ArticleSource]
    date: Mapped[datetime]

    # Counterpart of ``Event.original_article``.
    # NOTE(review): annotated as a single, non-optional Event, yet nothing
    # visible here prevents several events (or none) from pointing at the
    # same article — confirm the intended cardinality.
    event: Mapped["Event"] = relationship(back_populates="original_article")


class Event(Base):
    """ORM model mapped to the ``event`` table."""

    __tablename__ = "event"

    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str]
    description: Mapped[str]
    analysis: Mapped[str]
    duplicate: Mapped[bool]
    date: Mapped[datetime]
    is_singapore: Mapped[bool]

    # Foreign key to the article row this event refers to.
    original_article_id: Mapped[int] = mapped_column(ForeignKey("article.id"))

    # Many-to-many link to Category through the ``event_category`` table.
    categories: Mapped[list["Category"]] = relationship(
        secondary="event_category", back_populates="events"
    )

    # Counterpart of ``Article.event``.
    original_article: Mapped[Article] = relationship(back_populates="event")


class Category(Base):
    """ORM model mapped to the ``category`` table."""

    __tablename__ = "category"

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str]

    # Many-to-many link to Event through the ``event_category`` table;
    # counterpart of ``Event.categories``.
    events: Mapped[list[Event]] = relationship(
        back_populates="categories", secondary="event_category"
    )


class EventCategory(Base):
    """Association table between ``event`` and ``category``.

    The two foreign keys together form the composite primary key.
    """

    __tablename__ = "event_category"

    event_id: Mapped[int] = mapped_column(
        ForeignKey("event.id"), primary_key=True
    )
    category_id: Mapped[int] = mapped_column(
        ForeignKey("category.id"), primary_key=True
    )
96 changes: 96 additions & 0 deletions backend/src/scrapers/cna/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import argparse
import asyncio
import json
from sqlalchemy.orm import Session
from src.events.models import Article, ArticleSource
from src.common.database import engine
from pydantic import BaseModel, ConfigDict
from bs4 import BeautifulSoup
import os


# Command-line interface: -i/--input names the folder that holds the
# per-article body text files read by process().
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", help="input folder path")

# NOTE(review): arguments are parsed at import time and --input is optional,
# so folder_path is None when the flag is omitted — confirm this is intended.
args = parser.parse_args()
folder_path = args.input


# uuids of the articles handled so far. The set is shared by every process()
# call, so an article listed under several categories is only imported once.
processed_ids: set = set()

# Category names; process() uses each one as the stem of a listing file
# under ./src/scrapers/cna/data/ and as a suffix of the body text file names.
CATEGORIES = [
    "Asia",
    "East Asia",
    "Singapore",
    "World",
    "Commentary",
    "CNA Explains",
    "Business",
    "Sport",
    "CNA Insider",
]


class CNAArticle(BaseModel):
    """Fields read from one raw item of a CNA listing file.

    Keys that are not declared below are accepted as well (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    # Identification of the item; process() only imports type == "article".
    type: str
    uuid: str
    nid: str

    # Title and links.
    title: str
    title_url: str
    absolute_url: str

    # Optional summary texts; process() builds the summary from `description`.
    field_summary: str | None = None
    description: str | None = None

    # Kept as the raw string found in the listing.
    release_date: str


async def process(category: str):
    """Import the scraped CNA articles of one category into the database.

    Reads ``./src/scrapers/cna/data/{category}.json`` (a list of pages, each
    a list of raw items) and validates every item as a ``CNAArticle``. For
    each item of type ``"article"`` whose uuid has not been seen yet, the
    body text is read from ``{uuid}_{category}.txt`` inside the ``--input``
    folder and an ``Article`` row is inserted and committed.

    Items with an empty body are skipped. Any exception raised while handling
    an item is printed and the item is skipped, so one bad item does not
    abort the import (best-effort).

    Note: the function is declared ``async`` but contains no ``await``, so
    each call runs to completion without yielding to the event loop.

    Args:
        category: Category name; selects both the listing file and the
            suffix of the body text file names.
    """
    # Load the whole listing first so the file is closed before processing.
    with open(f"./src/scrapers/cna/data/{category}.json") as listing_file:
        data = json.load(listing_file)

    for page_index, page in enumerate(data):
        for index, item in enumerate(page):
            try:
                article = CNAArticle.model_validate(item)

                if article.type != "article":
                    continue
                if article.uuid in processed_ids:
                    continue
                processed_ids.add(article.uuid)

                # Read body text from scrape.py
                body_path = os.path.join(
                    folder_path, f"{article.uuid}_{category}.txt"
                )
                with open(body_path) as body_file:
                    body = body_file.read().strip()

                if not body:
                    continue

                # The description is an HTML fragment; keep only its text.
                # Naming the parser explicitly keeps the result independent
                # of which optional parsers happen to be installed and
                # avoids bs4's GuessedAtParserWarning.
                summary = ""
                if article.description:
                    summary = BeautifulSoup(
                        article.description, "html.parser"
                    ).getText()

                # Add to database
                # NOTE(review): release_date is passed on as a raw string
                # while Article.date is a datetime column — confirm the
                # database driver accepts this format.
                article_orm = Article(
                    title=article.title,
                    summary=summary,
                    url=article.absolute_url,
                    source=ArticleSource.CNA,
                    body=body,
                    date=article.release_date,
                )
                # One session and commit per article, so a failure only
                # affects the article it occurred on.
                with Session(engine) as session:
                    session.add(article_orm)
                    session.commit()

            except Exception as e:
                print(f"{category}: something went wrong with {page_index}, {index}")
                print(e)


async def process_all_categories():
    """Run ``process`` for every entry of ``CATEGORIES`` and wait for all of them."""
    # gather() must be awaited: without it this coroutine returns right away
    # and asyncio.run() shuts the loop down, cancelling the scheduled
    # process() tasks before they get to do their work.
    await asyncio.gather(*[process(category) for category in CATEGORIES])


if __name__ == "__main__":
    asyncio.run(process_all_categories())
43 changes: 43 additions & 0 deletions backend/src/scrapers/guardian/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse

import json
from sqlalchemy.orm import Session
from src.events.models import Article, ArticleSource
from src.common.database import engine
from pydantic import BaseModel, ConfigDict


# Command-line interface. The value of -i/--input is passed straight to
# open() and json.load() below, so it names a JSON file, not a folder.
# NOTE(review): arguments are parsed at import time and --input is optional;
# omitting it makes the open() call below fail — confirm this is acceptable.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", help="input JSON file path")
args = parser.parse_args()


class GuardianArticleFields(BaseModel):
    """The ``fields`` object of a Guardian article record.

    Keys that are not declared below are accepted as well (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    # Stored as the article body.
    bodyText: str
    # Optional; stored as the article summary when present.
    trailText: str | None = None


class GuardianArticle(BaseModel):
    """One article record of the Guardian JSON input.

    Keys that are not declared below are accepted as well (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    fields: GuardianArticleFields

    # Stored as the article url, title and date respectively.
    webUrl: str
    webTitle: str
    # Kept as the raw string found in the input.
    webPublicationDate: str


# Import every record of the JSON file given via --input as an Article row.
with open(args.input) as input_file:
    records = json.load(input_file)
    for record in records:
        parsed = GuardianArticle.model_validate(record)
        fields = parsed.fields
        new_article = Article(
            title=parsed.webTitle,
            # A missing or empty trail text is stored as an empty summary.
            summary=fields.trailText or "",
            url=parsed.webUrl,
            source=ArticleSource.GUARDIAN,
            body=fields.bodyText,
            date=parsed.webPublicationDate,
        )
        # One session and commit per article.
        with Session(engine) as session:
            session.add(new_article)
            session.commit()
3 changes: 2 additions & 1 deletion backend/src/utils/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# this is just to load models in alembic

from src.auth import models # noqa: F401
from src.auth import models as auth_models # noqa: F401
from src.events import models as event_models # noqa: F401

0 comments on commit 74092ed

Please sign in to comment.