Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add event preprocessing #17

Merged
merged 5 commits into from
Sep 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Create article and event tables

Revision ID: 423a1e6552f0
Revises: 681607ac9aeb
Create Date: 2024-09-21 16:44:37.663284

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision: str = "423a1e6552f0"
down_revision: Union[str, None] = "681607ac9aeb"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the ``articlesource`` enum and the initial content tables.

    Creates, in order: ``article``, ``category``, ``event`` and the
    ``event_category`` association table that references the latter two.
    """

    def required(name, type_):
        # Every data column in this revision is NOT NULL.
        return sa.Column(name, type_, nullable=False)

    # The enum type is created explicitly up front; the ``source`` column
    # below refers to it with ``create_type=False`` so it is not created again.
    source_enum = sa.Enum("CNA", "GUARDIAN", name="articlesource")
    source_enum.create(op.get_bind())

    op.create_table(
        "article",
        required("id", sa.Integer()),
        required("title", sa.String()),
        required("summary", sa.String()),
        required("body", sa.String()),
        required("url", sa.String()),
        required(
            "source",
            postgresql.ENUM("CNA", "GUARDIAN", name="articlesource", create_type=False),
        ),
        sa.PrimaryKeyConstraint("id"),
    )

    op.create_table(
        "category",
        required("id", sa.Integer()),
        required("name", sa.String()),
        sa.PrimaryKeyConstraint("id"),
    )

    op.create_table(
        "event",
        required("id", sa.Integer()),
        required("title", sa.String()),
        required("description", sa.String()),
        required("analysis", sa.String()),
        required("duplicate", sa.Boolean()),
        required("date", sa.DateTime()),
        sa.PrimaryKeyConstraint("id"),
    )

    # Many-to-many link between events and categories; the pair of foreign
    # keys forms the composite primary key.
    op.create_table(
        "event_category",
        required("event_id", sa.Integer()),
        required("category_id", sa.Integer()),
        sa.ForeignKeyConstraint(["category_id"], ["category.id"]),
        sa.ForeignKeyConstraint(["event_id"], ["event.id"]),
        sa.PrimaryKeyConstraint("event_id", "category_id"),
    )


def downgrade() -> None:
    """Drop the tables and the enum type created by this revision."""
    # The association table goes first, then the tables it references,
    # then ``article``.
    for table_name in ("event_category", "event", "category", "article"):
        op.drop_table(table_name)

    # The enum type is removed last, after the ``article`` table is gone.
    source_enum = sa.Enum("CNA", "GUARDIAN", name="articlesource")
    source_enum.drop(op.get_bind())
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Add columns to event table

Revision ID: e84e13568072
Revises: 423a1e6552f0
Create Date: 2024-09-21 17:21:24.515814

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = "e84e13568072"
down_revision: Union[str, None] = "423a1e6552f0"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add ``is_singapore`` and ``original_article_id`` to the ``event`` table.

    ``original_article_id`` references ``article.id`` through an explicitly
    named foreign key, so the constraint has a stable name it can be dropped
    by.
    """
    # NOTE(review): both columns are NOT NULL and have no server default;
    # this presumably relies on ``event`` being empty when the migration
    # runs -- confirm before applying to a populated database.
    op.add_column("event", sa.Column("is_singapore", sa.Boolean(), nullable=False))
    op.add_column(
        "event", sa.Column("original_article_id", sa.Integer(), nullable=False)
    )
    # The name equals the one PostgreSQL assigns to an unnamed foreign key on
    # this column, so databases migrated with the unnamed variant end up with
    # the same constraint name.
    op.create_foreign_key(
        "event_original_article_id_fkey",
        "event",
        "article",
        ["original_article_id"],
        ["id"],
    )


def downgrade() -> None:
    """Remove the foreign key and the two columns this revision added to ``event``."""
    # A constraint can only be dropped by name; the auto-generated ``None``
    # placeholder cannot work. ``event_original_article_id_fkey`` is the name
    # PostgreSQL gives an unnamed foreign key on ``event.original_article_id``.
    # NOTE(review): assumes a PostgreSQL database -- adjust the name for
    # other backends.
    op.drop_constraint("event_original_article_id_fkey", "event", type_="foreignkey")
    op.drop_column("event", "original_article_id")
    op.drop_column("event", "is_singapore")
31 changes: 31 additions & 0 deletions backend/alembic/versions/f3e847c3ee9d_add_article_date_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Add article date column

Revision ID: f3e847c3ee9d
Revises: e84e13568072
Create Date: 2024-09-21 18:25:05.912853

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = "f3e847c3ee9d"
down_revision: Union[str, None] = "e84e13568072"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add a NOT NULL ``date`` timestamp column to the ``article`` table."""
    date_column = sa.Column("date", sa.DateTime(), nullable=False)
    op.add_column("article", date_column)


def downgrade() -> None:
    """Drop the ``date`` column from the ``article`` table again."""
    table_name, column_name = "article", "date"
    op.drop_column(table_name, column_name)
63 changes: 63 additions & 0 deletions backend/src/events/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from enum import Enum
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlalchemy import ForeignKey
from datetime import datetime
from src.common.base import Base


class ArticleSource(str, Enum):
    """News outlets an article can originate from.

    Members mix in ``str``, so each one compares equal to its plain string
    value.
    """

    CNA = "CNA"
    GUARDIAN = "GUARDIAN"


class Article(Base):
    """ORM model for one stored news article (table ``article``)."""

    __tablename__ = "article"

    id: Mapped[int] = mapped_column(primary_key=True)

    # Article content and provenance.
    title: Mapped[str] = mapped_column()
    summary: Mapped[str] = mapped_column()
    body: Mapped[str] = mapped_column()
    url: Mapped[str] = mapped_column()
    source: Mapped[ArticleSource] = mapped_column()
    date: Mapped[datetime] = mapped_column()

    # Reverse side of ``Event.original_article``.
    # NOTE(review): annotated as a single ``Event`` although the foreign key
    # lives on ``event.original_article_id`` with no unique constraint visible
    # here -- confirm that at most one event per article is intended.
    event: Mapped["Event"] = relationship(back_populates="original_article")


class Event(Base):
    """ORM model for an event linked to a source article (table ``event``)."""

    __tablename__ = "event"

    id: Mapped[int] = mapped_column(primary_key=True)

    # Descriptive fields.
    title: Mapped[str] = mapped_column()
    description: Mapped[str] = mapped_column()
    analysis: Mapped[str] = mapped_column()

    # Flags and timestamp.
    duplicate: Mapped[bool] = mapped_column()
    date: Mapped[datetime] = mapped_column()
    is_singapore: Mapped[bool] = mapped_column()

    # The article this event is linked to.
    original_article_id: Mapped[int] = mapped_column(ForeignKey("article.id"))
    original_article: Mapped[Article] = relationship(back_populates="event")

    # Many-to-many with ``Category`` through the ``event_category`` table.
    categories: Mapped[list["Category"]] = relationship(
        secondary="event_category",
        back_populates="events",
    )


class Category(Base):
    """ORM model for a category that events can be tagged with (table ``category``)."""

    __tablename__ = "category"

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column()

    # Many-to-many with ``Event`` through the ``event_category`` table.
    events: Mapped[list[Event]] = relationship(
        back_populates="categories",
        secondary="event_category",
    )


class EventCategory(Base):
    """Association table between events and categories (table ``event_category``).

    Both foreign keys together form the composite primary key.
    """

    __tablename__ = "event_category"

    event_id: Mapped[int] = mapped_column(
        ForeignKey("event.id"),
        primary_key=True,
    )
    category_id: Mapped[int] = mapped_column(
        ForeignKey("category.id"),
        primary_key=True,
    )
96 changes: 96 additions & 0 deletions backend/src/scrapers/cna/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import argparse
import asyncio
import json
from sqlalchemy.orm import Session
from src.events.models import Article, ArticleSource
from src.common.database import engine
from pydantic import BaseModel, ConfigDict
from bs4 import BeautifulSoup
import os


# Command-line interface: the folder that holds the scraped article bodies.
parser = argparse.ArgumentParser()
# Required: every body file is looked up under this folder, so without it
# no article could be imported at all.
parser.add_argument("-i", "--input", required=True, help="input folder path")

args = parser.parse_args()
folder_path = args.input


# UUIDs of the articles already handled in this run. Shared by all
# categories, so an article listed under several of them is imported once.
processed_ids = set()

# Category names; each one selects a ``<category>.json`` listing file and the
# suffix of the matching body text files.
CATEGORIES = [
    "Asia",
    "East Asia",
    "Singapore",
    "World",
    "Commentary",
    "CNA Explains",
    "Business",
    "Sport",
    "CNA Insider",
]


class CNAArticle(BaseModel):
    # Validation schema for one raw entry of a scraped CNA category listing.
    # ``extra="allow"`` keeps any keys that are not declared below.
    model_config = ConfigDict(extra="allow")

    # Identification.
    type: str
    uuid: str
    nid: str

    # Title and links.
    title: str
    title_url: str
    absolute_url: str

    # Optional teaser texts.
    field_summary: str | None = None
    description: str | None = None

    # Publication date, kept as the raw string from the listing.
    release_date: str


async def process(category: str):
    """Import the scraped CNA articles of one category into the database.

    Reads ``./src/scrapers/cna/data/{category}.json`` (iterated as a list of
    pages, each a list of raw entries), validates every entry as a
    ``CNAArticle``, loads the matching body text from ``folder_path`` and
    stores one ``Article`` row per entry.

    Entries are skipped when their type is not ``"article"``, when their UUID
    was already seen in this run, or when the body text is blank. A failure
    on a single entry is printed and processing continues with the next one.

    Args:
        category: Category name; selects the listing file and the suffix of
            the body file names.
    """
    listing_path = f"./src/scrapers/cna/data/{category}.json"
    with open(listing_path, encoding="utf-8") as listing_file:
        data = json.load(listing_file)

    for page_index, page in enumerate(data):
        for index, item in enumerate(page):
            try:
                article = CNAArticle.model_validate(item)

                if article.type != "article":
                    continue
                if article.uuid in processed_ids:
                    continue
                # Marked as seen before the body is read, so an entry whose
                # body is missing or blank is not retried under another
                # category either.
                processed_ids.add(article.uuid)

                # Read body text from scrape.py
                # NOTE(review): read with the platform default encoding to
                # match however scrape.py wrote the file -- confirm and pin
                # both sides to UTF-8.
                body_path = os.path.join(folder_path, f"{article.uuid}_{category}.txt")
                with open(body_path) as body_file:
                    body = body_file.read()

                if not body.strip():
                    continue

                # The description may contain markup; keep only its text.
                # The parser is named explicitly so the result does not
                # depend on which optional parsers happen to be installed.
                if article.description:
                    summary = BeautifulSoup(
                        article.description, "html.parser"
                    ).getText()
                else:
                    summary = ""

                # Add to database
                # NOTE(review): ``release_date`` is passed through as the raw
                # string although ``Article.date`` is a datetime column --
                # confirm the database driver converts it as expected.
                article_orm = Article(
                    title=article.title,
                    summary=summary,
                    url=article.absolute_url,
                    source=ArticleSource.CNA,
                    body=body.strip(),
                    date=article.release_date,
                )
                with Session(engine) as session:
                    session.add(article_orm)
                    session.commit()

            except Exception as e:
                # Best effort: report the failing entry and carry on.
                print(f"{category}: something went wrong with {page_index}, {index}")
                print(e)


async def process_all_categories():
    """Run ``process`` for every entry of ``CATEGORIES`` and wait for all of them."""
    # The gather must be awaited: otherwise this coroutine returns right after
    # scheduling the work, nothing waits for it to finish, and an exception
    # escaping ``process`` would never be reported to the caller.
    await asyncio.gather(*(process(category) for category in CATEGORIES))


if __name__ == "__main__":
    # Script entry point: import every category inside a fresh event loop.
    entry_point = process_all_categories()
    asyncio.run(entry_point)
43 changes: 43 additions & 0 deletions backend/src/scrapers/guardian/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import argparse

import json
from sqlalchemy.orm import Session
from src.events.models import Article, ArticleSource
from src.common.database import engine
from pydantic import BaseModel, ConfigDict


# Command-line interface. Despite the help text, the value is opened as the
# JSON file to import.
parser = argparse.ArgumentParser()
# Required: without a path there is nothing to import, and a missing value
# would otherwise only surface as a TypeError from ``open``.
parser.add_argument("-i", "--input", required=True, help="input folder path")
args = parser.parse_args()


class GuardianArticleFields(BaseModel):
    # Validation schema for the nested ``fields`` object of a Guardian row.
    # ``extra="allow"`` keeps any keys that are not declared below.
    model_config = ConfigDict(extra="allow")

    # Full article text (mandatory).
    bodyText: str
    # Optional short teaser; used as the article summary when present.
    trailText: str | None = None


class GuardianArticle(BaseModel):
    # Validation schema for one row of the Guardian input file.
    # ``extra="allow"`` keeps any keys that are not declared below.
    model_config = ConfigDict(extra="allow")

    # Nested text fields.
    fields: GuardianArticleFields

    # Link and headline.
    webUrl: str
    webTitle: str

    # Publication date, kept as the raw string from the input.
    webPublicationDate: str


# Load the rows to import from the JSON file given on the command line.
with open(args.input, encoding="utf-8") as f:
    data = json.load(f)

# One session and one commit for the whole file: a row that fails validation
# aborts the import before anything is committed, so a re-run after fixing
# the input does not insert the earlier rows a second time.
with Session(engine) as session:
    for row in data:
        article = GuardianArticle.model_validate(row)
        # NOTE(review): ``webPublicationDate`` is passed through as the raw
        # string although ``Article.date`` is a datetime column -- confirm
        # the database driver converts it as expected.
        article_orm = Article(
            title=article.webTitle,
            # A missing teaser is stored as an empty string.
            summary=article.fields.trailText or "",
            url=article.webUrl,
            source=ArticleSource.GUARDIAN,
            body=article.fields.bodyText,
            date=article.webPublicationDate,
        )
        session.add(article_orm)
    session.commit()
3 changes: 2 additions & 1 deletion backend/src/utils/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# this is just to load models in alembic

from src.auth import models # noqa: F401
from src.auth import models as auth_models # noqa: F401
from src.events import models as event_models # noqa: F401
Loading