From 9f271a3efb2d6c44e1fa3441d755332643a0545a Mon Sep 17 00:00:00 2001 From: adurivault Date: Fri, 4 Oct 2024 18:22:28 +0200 Subject: [PATCH] feat: add notebook that generates latvian article links --- .gitignore | 2 +- notebooks/latvia/scrape GDELT api.ipynb | 456 ++++++++++++++++++++++++ pyproject.toml | 2 + uv.lock | 15 + 4 files changed, 474 insertions(+), 1 deletion(-) create mode 100644 notebooks/latvia/scrape GDELT api.ipynb diff --git a/.gitignore b/.gitignore index 0a825df..010ab42 100644 --- a/.gitignore +++ b/.gitignore @@ -53,7 +53,7 @@ Thumbs.db *.wmv *.pyc -notebooks/.ipynb_checkpoints +notebooks/**/.ipynb_checkpoints/ .env .env .venv \ No newline at end of file diff --git a/notebooks/latvia/scrape GDELT api.ipynb b/notebooks/latvia/scrape GDELT api.ipynb new file mode 100644 index 0000000..7e64037 --- /dev/null +++ b/notebooks/latvia/scrape GDELT api.ipynb @@ -0,0 +1,456 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "C2H7hlVkt_ep" + }, + "outputs": [], + "source": [ + "from urllib.request import urlopen\n", + "import pandas as pd\n", + "from gdeltdoc import GdeltDoc, Filters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get list of all GDELT themes\n", + "We need the list of predefined themes to be able to filter" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
themecount
0TAX_FNCACT999601552
1TAX_ETHNICITY410780218
2EPU_POLICY384818230
3CRISISLEX_CRISISLEXREC373229208
4TAX_WORLDLANGUAGES348186680
.........
59310TAX_WORLDLANGUAGES_PUNAPA1
59311TAX_WORLDBIRDS_SWALLOWTAILED_HUMMINGBIRDS1
59312TAX_WORLDMAMMALS_PACIFIC_DEGU1
59313TAX_WORLDBIRDS_FLAMECRESTED_TANAGER1
59314TAX_WORLDLANGUAGES_BOROAS1
\n", + "

59315 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " theme count\n", + "0 TAX_FNCACT 999601552\n", + "1 TAX_ETHNICITY 410780218\n", + "2 EPU_POLICY 384818230\n", + "3 CRISISLEX_CRISISLEXREC 373229208\n", + "4 TAX_WORLDLANGUAGES 348186680\n", + "... ... ...\n", + "59310 TAX_WORLDLANGUAGES_PUNAPA 1\n", + "59311 TAX_WORLDBIRDS_SWALLOWTAILED_HUMMINGBIRDS 1\n", + "59312 TAX_WORLDMAMMALS_PACIFIC_DEGU 1\n", + "59313 TAX_WORLDBIRDS_FLAMECRESTED_TANAGER 1\n", + "59314 TAX_WORLDLANGUAGES_BOROAS 1\n", + "\n", + "[59315 rows x 2 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "THEMES_URL = \"http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT\"\n", + "\n", + "\n", + "def get_themes(url: str) -> pd.DataFrame: \n", + " # Fetch the content using urllib\n", + " with urlopen(url) as response:\n", + " data = response.read().decode()\n", + " \n", + " # Split the data into lines\n", + " lines = data.strip().split(\"\\n\")\n", + " \n", + " # Split each line into key-value pairs\n", + " rows = [line.split(\"\\t\") for line in lines]\n", + " \n", + " # Create a DataFrame from the rows\n", + " df = pd.DataFrame(rows, columns=['theme', 'count'])\n", + " df['count'] = df['count'].astype(int)\n", + " \n", + " return df\n", + "\n", + "def get_climate_themes(themes_df) -> list[str] : \n", + " return themes_df[themes_df[\"theme\"].str.contains(\"CLIMATE\")][\"theme\"].to_list()\n", + "\n", + "themes_df = get_themes(THEMES_URL)\n", + "themes_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['WB_405_BUSINESS_CLIMATE',\n", + " 'WB_567_CLIMATE_CHANGE',\n", + " 'ENV_CLIMATECHANGE',\n", + " 'UNGP_CLIMATE_CHANGE_ACTION',\n", + " 'WB_1949_CLIMATE_SMART_AGRICULTURE',\n", + " 'WB_568_CLIMATE_SERVICES',\n", + " 'WB_579_CLIMATE_CHANGE_MITIGATION',\n", + " 'WB_571_CLIMATE_SCIENCE',\n", + " 'WB_1841_SHORT_LIVED_CLIMATE_POLLUTANTS',\n", + " 
'WB_1844_MARKET_BASED_CLIMATE_CHANGE_MITIGATION',\n", + " 'WB_1773_CLIMATE_CHANGE_IMPACTS',\n", + " 'WB_1847_CLIMATE_FINANCE',\n", + " 'WB_574_CLIMATE_CHANGE_ADAPTATION',\n", + " 'WB_959_CLIMATE_CHANGE_LAW',\n", + " 'WB_747_SOCIAL_RESILIENCE_AND_CLIMATE_CHANGE',\n", + " 'WB_1774_CLIMATE_FORECASTING',\n", + " 'WB_2673_JOBS_AND_CLIMATE_CHANGE',\n", + " 'TAX_AIDGROUPS_CLIMATE_ACTION_NETWORK',\n", + " 'WB_572_CLIMATE_RESILIENT_DEVELOPMENT',\n", + " 'WB_2639_CLIMATE_EFFICIENT_INDUSTRIES',\n", + " 'WB_573_CLIMATE_RISK_MANAGEMENT',\n", + " 'WB_1849_PUBLIC_CLIMATE_FINANCE',\n", + " 'WB_1838_CLIMATE_RISK_SCREENING',\n", + " 'WB_1850_PRIVATE_CLIMATE_FINANCE',\n", + " 'WB_1839_OZONE_LAYER_DEPLETION_AND_CLIMATE_CHANGE',\n", + " 'WB_575_COMMUNITY_BASED_CLIMATE_ADAPTATION',\n", + " 'WB_1750_CLIMATE_CHANGE_ADAPTATION_IMPACTS']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "climate_themes = get_climate_themes(themes_df)\n", + "climate_themes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scrape the GDELT API for Latvian climate articles\n", + "We will use this wrapper around the GDELT API: https://github.com/alex9smith/gdelt-doc-api" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "250 articles found for theme WB_405_BUSINESS_CLIMATE, in 2022\n", + "250 articles found for theme WB_405_BUSINESS_CLIMATE, in 2023\n", + "250 articles found for theme WB_405_BUSINESS_CLIMATE, in 2024\n", + "250 articles found for theme WB_567_CLIMATE_CHANGE, in 2022\n", + "205 articles found for theme WB_567_CLIMATE_CHANGE, in 2023\n", + "250 articles found for theme WB_567_CLIMATE_CHANGE, in 2024\n", + "250 articles found for theme ENV_CLIMATECHANGE, in 2022\n", + "151 articles found for theme ENV_CLIMATECHANGE, in 2023\n", + "150 articles found for theme ENV_CLIMATECHANGE, in 2024\n", + "250 articles
found for theme UNGP_CLIMATE_CHANGE_ACTION, in 2022\n", + "156 articles found for theme UNGP_CLIMATE_CHANGE_ACTION, in 2023\n", + "159 articles found for theme UNGP_CLIMATE_CHANGE_ACTION, in 2024\n", + "149 articles found for theme WB_1949_CLIMATE_SMART_AGRICULTURE, in 2022\n", + "73 articles found for theme WB_1949_CLIMATE_SMART_AGRICULTURE, in 2023\n", + "78 articles found for theme WB_1949_CLIMATE_SMART_AGRICULTURE, in 2024\n", + "126 articles found for theme WB_568_CLIMATE_SERVICES, in 2022\n", + "70 articles found for theme WB_568_CLIMATE_SERVICES, in 2023\n", + "124 articles found for theme WB_568_CLIMATE_SERVICES, in 2024\n", + "26 articles found for theme WB_579_CLIMATE_CHANGE_MITIGATION, in 2022\n", + "9 articles found for theme WB_579_CLIMATE_CHANGE_MITIGATION, in 2023\n", + "7 articles found for theme WB_579_CLIMATE_CHANGE_MITIGATION, in 2024\n", + "7 articles found for theme WB_571_CLIMATE_SCIENCE, in 2022\n", + "14 articles found for theme WB_571_CLIMATE_SCIENCE, in 2023\n", + "6 articles found for theme WB_571_CLIMATE_SCIENCE, in 2024\n", + "10 articles found for theme WB_1841_SHORT_LIVED_CLIMATE_POLLUTANTS, in 2022\n", + "4 articles found for theme WB_1841_SHORT_LIVED_CLIMATE_POLLUTANTS, in 2023\n", + "3 articles found for theme WB_1841_SHORT_LIVED_CLIMATE_POLLUTANTS, in 2024\n", + "7 articles found for theme WB_1844_MARKET_BASED_CLIMATE_CHANGE_MITIGATION, in 2022\n", + "2 articles found for theme WB_1844_MARKET_BASED_CLIMATE_CHANGE_MITIGATION, in 2023\n", + "0 articles found for theme WB_1844_MARKET_BASED_CLIMATE_CHANGE_MITIGATION, in 2024\n", + "6 articles found for theme WB_1773_CLIMATE_CHANGE_IMPACTS, in 2022\n", + "12 articles found for theme WB_1773_CLIMATE_CHANGE_IMPACTS, in 2023\n", + "5 articles found for theme WB_1773_CLIMATE_CHANGE_IMPACTS, in 2024\n", + "4 articles found for theme WB_1847_CLIMATE_FINANCE, in 2022\n", + "1 articles found for theme WB_1847_CLIMATE_FINANCE, in 2023\n", + "0 articles found for theme WB_1847_CLIMATE_FINANCE, 
in 2024\n", + "5 articles found for theme WB_574_CLIMATE_CHANGE_ADAPTATION, in 2022\n", + "2 articles found for theme WB_574_CLIMATE_CHANGE_ADAPTATION, in 2023\n", + "0 articles found for theme WB_574_CLIMATE_CHANGE_ADAPTATION, in 2024\n", + "0 articles found for theme WB_959_CLIMATE_CHANGE_LAW, in 2022\n", + "2 articles found for theme WB_959_CLIMATE_CHANGE_LAW, in 2023\n", + "0 articles found for theme WB_959_CLIMATE_CHANGE_LAW, in 2024\n", + "1 articles found for theme WB_747_SOCIAL_RESILIENCE_AND_CLIMATE_CHANGE, in 2022\n", + "0 articles found for theme WB_747_SOCIAL_RESILIENCE_AND_CLIMATE_CHANGE, in 2023\n", + "1 articles found for theme WB_747_SOCIAL_RESILIENCE_AND_CLIMATE_CHANGE, in 2024\n", + "0 articles found for theme WB_1774_CLIMATE_FORECASTING, in 2022\n", + "1 articles found for theme WB_1774_CLIMATE_FORECASTING, in 2023\n", + "0 articles found for theme WB_1774_CLIMATE_FORECASTING, in 2024\n", + "2 articles found for theme WB_2673_JOBS_AND_CLIMATE_CHANGE, in 2022\n", + "0 articles found for theme WB_2673_JOBS_AND_CLIMATE_CHANGE, in 2023\n", + "0 articles found for theme WB_2673_JOBS_AND_CLIMATE_CHANGE, in 2024\n", + "2 articles found for theme TAX_AIDGROUPS_CLIMATE_ACTION_NETWORK, in 2022\n", + "0 articles found for theme TAX_AIDGROUPS_CLIMATE_ACTION_NETWORK, in 2023\n", + "0 articles found for theme TAX_AIDGROUPS_CLIMATE_ACTION_NETWORK, in 2024\n", + "0 articles found for theme WB_572_CLIMATE_RESILIENT_DEVELOPMENT, in 2022\n", + "0 articles found for theme WB_572_CLIMATE_RESILIENT_DEVELOPMENT, in 2023\n", + "0 articles found for theme WB_572_CLIMATE_RESILIENT_DEVELOPMENT, in 2024\n", + "0 articles found for theme WB_2639_CLIMATE_EFFICIENT_INDUSTRIES, in 2022\n", + "0 articles found for theme WB_2639_CLIMATE_EFFICIENT_INDUSTRIES, in 2023\n", + "0 articles found for theme WB_2639_CLIMATE_EFFICIENT_INDUSTRIES, in 2024\n", + "0 articles found for theme WB_573_CLIMATE_RISK_MANAGEMENT, in 2022\n", + "0 articles found for theme 
WB_573_CLIMATE_RISK_MANAGEMENT, in 2023\n", + "0 articles found for theme WB_573_CLIMATE_RISK_MANAGEMENT, in 2024\n", + "2 articles found for theme WB_1849_PUBLIC_CLIMATE_FINANCE, in 2022\n", + "0 articles found for theme WB_1849_PUBLIC_CLIMATE_FINANCE, in 2023\n", + "0 articles found for theme WB_1849_PUBLIC_CLIMATE_FINANCE, in 2024\n", + "0 articles found for theme WB_1838_CLIMATE_RISK_SCREENING, in 2022\n", + "0 articles found for theme WB_1838_CLIMATE_RISK_SCREENING, in 2023\n", + "0 articles found for theme WB_1838_CLIMATE_RISK_SCREENING, in 2024\n", + "0 articles found for theme WB_1850_PRIVATE_CLIMATE_FINANCE, in 2022\n", + "0 articles found for theme WB_1850_PRIVATE_CLIMATE_FINANCE, in 2023\n", + "0 articles found for theme WB_1850_PRIVATE_CLIMATE_FINANCE, in 2024\n", + "0 articles found for theme WB_1839_OZONE_LAYER_DEPLETION_AND_CLIMATE_CHANGE, in 2022\n", + "0 articles found for theme WB_1839_OZONE_LAYER_DEPLETION_AND_CLIMATE_CHANGE, in 2023\n", + "0 articles found for theme WB_1839_OZONE_LAYER_DEPLETION_AND_CLIMATE_CHANGE, in 2024\n", + "0 articles found for theme WB_575_COMMUNITY_BASED_CLIMATE_ADAPTATION, in 2022\n", + "0 articles found for theme WB_575_COMMUNITY_BASED_CLIMATE_ADAPTATION, in 2023\n", + "0 articles found for theme WB_575_COMMUNITY_BASED_CLIMATE_ADAPTATION, in 2024\n", + "0 articles found for theme WB_1750_CLIMATE_CHANGE_ADAPTATION_IMPACTS, in 2022\n", + "0 articles found for theme WB_1750_CLIMATE_CHANGE_ADAPTATION_IMPACTS, in 2023\n", + "0 articles found for theme WB_1750_CLIMATE_CHANGE_ADAPTATION_IMPACTS, in 2024\n" + ] + } + ], + "source": [ + "partial_articles_dfs = []\n", + "\n", + "for theme in climate_themes: \n", + " for year in [2022, 2023, 2024]: \n", + " f = Filters(\n", + " #keyword = \"climate change\",\n", + " start_date = f\"{year}-01-01\",\n", + " end_date = f\"{year}-12-31\", \n", + " theme = theme, \n", + " country = \"LG\", \n", + " )\n", + " \n", + " gd = GdeltDoc()\n", + " \n", + " # Search for articles matching the 
filters\n", + " partial_articles_df = gd.article_search(f)\n", + " print(f\"{len(partial_articles_df)} articles found for theme {theme}, in {year}\")\n", + " if partial_articles_df.empty: \n", + " continue\n", + " partial_articles_dfs.append(partial_articles_df)\n", + "\n", + "articles_df = pd.concat(partial_articles_dfs)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Deleting 1191 duplicates\n", + "1683 unique articles found\n" + ] + } + ], + "source": [ + "articles_df = articles_df[articles_df[\"language\"] == \"Latvian\"]\n", + "articles_df[\"seendate\"] = pd.to_datetime(articles_df[\"seendate\"])\n", + "\n", + "print(f\"Deleting {articles_df[\"url\"].duplicated().sum()} duplicates\")\n", + "articles_df = articles_df.drop_duplicates(\"url\")\n", + "print(f\"{len(articles_df)} unique articles found\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "domain\n", + "nra.lv 459\n", + "lsm.lv 388\n", + "delfi.lv 351\n", + "la.lv 225\n", + "diena.lv 65\n", + "reitingi.lv 60\n", + "ogrenet.lv 30\n", + "bnn.lv 20\n", + "tvnet.lv 19\n", + "ventasbalss.lv 19\n", + "ir.lv 17\n", + "mfa.gov.lv 13\n", + "ntz.lv 5\n", + "president.lv 5\n", + "latgaleslaiks.lv 3\n", + "vm.gov.lv 2\n", + "220.lv 1\n", + "brivalatvija.lv 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "articles_df[\"domain\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "articles_df.to_csv(\"../data/latvian_article_links.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml index 6fee931..c3f2dc1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,8 @@ dependencies = [ "openai>=1.51.0", "python-dotenv>=1.0.1", "anthropic>=0.34.2", + "gdeltdoc>=1.5.0", + "newspaper3k>=0.2.8", ] requires-python = ">=3.12" diff --git a/uv.lock b/uv.lock index 424bcb9..6147788 100644 --- a/uv.lock +++ b/uv.lock @@ -119,6 +119,7 @@ source = { editable = "." 
} dependencies = [ { name = "anthropic" }, { name = "beautifulsoup4" }, + { name = "gdeltdoc" }, { name = "gradio" }, { name = "newspaper3k" }, { name = "openai" }, @@ -130,6 +131,7 @@ dependencies = [ requires-dist = [ { name = "anthropic", specifier = ">=0.34.2" }, { name = "beautifulsoup4", specifier = ">=4.10.0" }, + { name = "gdeltdoc", specifier = ">=1.5.0" }, { name = "gradio", specifier = ">=4.44.1" }, { name = "newspaper3k", specifier = ">=0.2.8" }, { name = "openai", specifier = ">=1.51.0" }, @@ -299,6 +301,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1d/a0/6aaea0c2fbea2f89bfd5db25fb1e3481896a423002ebe4e55288907a97a3/fsspec-2024.9.0-py3-none-any.whl", hash = "sha256:a0947d552d8a6efa72cc2c730b12c41d043509156966cca4fb157b0f2a0c574b", size = 179253 }, ] +[[package]] +name = "gdeltdoc" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pandas" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7e/95/09d4213fe00455ef2ff0151cda312b024b19d6b2b687ec3dfe8cee5ec2db/gdeltdoc-1.5.0.tar.gz", hash = "sha256:3ad0726a03325f5ea76c6bf9c00bd2680c624866130b3bc6bc90cb7297327dc3", size = 11465 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/da/79ab86dafbc96c67f9551b3bf1b93e97c1de687b5bab4196ed50a28ddcdc/gdeltdoc-1.5.0-py3-none-any.whl", hash = "sha256:e14775fdfe07ecd2781a1fbb836b9e97b2de3435a8ec468f1a780a3a6a8c13ec", size = 13088 }, +] + [[package]] name = "gradio" version = "4.44.1"