Merge pull request #3 from dataforgoodfr/feat/multi_step_scrapping
Feat/multi step scrapping
SaboniAmine authored Oct 5, 2024
2 parents 6ccdf3c + dd2fb2a commit 41198c6
Showing 9 changed files with 1,665 additions and 132 deletions.
4 changes: 2 additions & 2 deletions .gitignore
@@ -4,7 +4,7 @@
 # https://www.atlassian.com/git/tutorials/saving-changes/gitignore

 .env
-data/*
+data/.ipynb_checkpoints/


 # Node artifact files
@@ -53,7 +53,7 @@ Thumbs.db
 *.wmv

 *.pyc
-notebooks/.ipynb_checkpoints
+notebooks/**/.ipynb_checkpoints/
 .env
 .env
 .venv
76 changes: 76 additions & 0 deletions climateguard/gdelt_scrapper.py
@@ -0,0 +1,76 @@
from urllib.request import urlopen
import pandas as pd
import gdeltdoc as gdelt
import functools
import itertools
from pathlib import Path

class GDELTScrapper:
    THEMES_URL = "http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT"
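    # The lookup file is a tab-separated list of "theme<TAB>count" pairs,
    # one per line, which is what themes_df below parses into a DataFrame.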

    @functools.cached_property
    def themes_df(self) -> pd.DataFrame:
        # Fetch the content using urllib
        with urlopen(self.THEMES_URL) as response:
            data = response.read().decode()

        # Split the data into lines
        lines = data.strip().split("\n")

        # Split each line into key-value pairs
        rows = [line.split("\t") for line in lines]

        # Create a DataFrame from the rows
        df = pd.DataFrame(rows, columns=["theme", "count"])
        df["count"] = df["count"].astype(int)

        return df

    def find_themes_related_to_keyword(self, keyword: str) -> list[str]:
        return self.themes_df[
            self.themes_df["theme"].str.contains(keyword, case=False)
        ]["theme"].to_list()

    def find_articles(self, themes: list[str], years: list[int]) -> pd.DataFrame:
        partial_articles_dfs = []

        gd = gdelt.GdeltDoc()
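        # One query per (theme, year) pair: the GDELT Doc API caps the
        # number of records returned per query, so narrower filters
        # recover more articles overall than one broad search would.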
        for theme, year in itertools.product(themes, years):
            f = gdelt.Filters(
                # keyword="climate change",
                start_date=f"{year}-01-01",
                end_date=f"{year}-12-31",
                theme=theme,
                country="LG",  # Latvia
            )

            partial_articles_df = gd.article_search(f)
            print(f"{len(partial_articles_df)} articles found for theme {theme} in {year}")
            partial_articles_dfs.append(partial_articles_df)

        articles_df = pd.concat(partial_articles_dfs)

        articles_df = articles_df[articles_df["language"] == "Latvian"]
        articles_df["seendate"] = pd.to_datetime(articles_df["seendate"])

        print(f"Deleting {articles_df['url'].duplicated().sum()} duplicates")
        articles_df = articles_df.drop_duplicates("url")
        print(f"{len(articles_df)} unique articles found")
        return articles_df


# Usage example:
if __name__ == "__main__":
    scraper = GDELTScrapper()

    # Find themes related to climate
    themes = scraper.find_themes_related_to_keyword("CLIMATE")
    print(f"Themes related to climate: {themes}")

    # Find articles for these themes and year range
    articles_df = scraper.find_articles(themes=themes, years=[2022, 2023, 2024])

    # This can be used as input for NewsScraper
    article_urls = articles_df["url"].to_list()

    # Save dataframe to a csv file
    file_path = Path(__file__).parent.parent / "data/latvian_article_links.csv"
    articles_df.to_csv(file_path)
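
The NewsScraper mentioned in the comments above is another of this PR's 9 changed files and is not shown here. As a rough sketch of that downstream step (fetch_html is a hypothetical stand-in, not the actual NewsScraper API), the saved CSV could be consumed like this:

from urllib.request import urlopen

import pandas as pd

def fetch_html(url: str) -> str:
    # Hypothetical placeholder for the NewsScraper step added elsewhere
    # in this PR: download the raw page for later text extraction.
    with urlopen(url) as response:
        return response.read().decode("utf-8", errors="replace")

links_df = pd.read_csv("data/latvian_article_links.csv")
pages = {url: fetch_html(url) for url in links_df["url"].head(5)}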