update epu scripts and outputs

worldbank · Mar 21, 2024 · fbebfcc · fbebfcc
1 parent 3f8e32b
commit fbebfcc
Show file tree

Hide file tree

Showing 55 changed files with 19,407 additions and 9,218 deletions.
diff --git a/docs/images/interactive/text/ep_sentiment.html b/docs/images/interactive/text/ep_sentiment.html
diff --git a/docs/images/interactive/text/epu_pic.html b/docs/images/interactive/text/epu_pic.html
diff --git a/docs/text/epu.md b/docs/text/epu.md
@@ -2,7 +2,7 @@
 
 Uncertainty poses challenges to the stability and growth of economies. The Economic Policy Uncertainty Index (EPU) developed by {cite:t}`baker2016measuring` emerges as a vital tool to measure both economic- and policy-related uncertainties. Specifically, they count how frequently economic- and policy-related terms appear in newspapers (relative to the number of all news articles), and the EPU index is then produced through standardization and normalization:
 
-- Let $ X_{it} = \frac{\text{EPU news in newspaper } i \text{ at time } t}{\text{All news in newspaper } i \text{ at time } t} $ and pre-defined $T_1$ to be the standardization and normalization period;
+- Let $ X_{it} = \frac{\text{EPU news in newspaper } i \text{ at time } t}{\text{All scraped news in newspaper } i \text{ at time } t} $, and let the pre-defined period $T_1$ be the standardization and normalization period;
 
 - Calculate the standard deviation $\sigma_i$ for each newspaper $i$ over $T_1$.
 
@@ -21,7 +21,7 @@ Beyond the $Z_t$ being arithmetic mean of $Y_{it}$, we develop a weighted scheme
 frameborder="0" marginwidth="0" marginheight="0" width="800" height="750"></iframe>
 </div>
 
-The sentiment of economic policy-related news could act as a barometer for uncertainty. Analyzing the shifts and changes in the sentiment of news articles and reports pertaining to economic policy could reflect the underlying economic uncertainty.
+The sentiment of economic policy-related news could act as a barometer for uncertainty. Analyzing the shifts and changes in the sentiment of news articles and reports pertaining to economic policy could reflect the underlying economic uncertainty. To construct the sentiment scores, we first filter the news articles identified as both "economic" and "policy," calculate the sentiment (polarity) score for each article, and aggregate the scores by month.
 
 <div>
 <iframe src="https://worldbank.github.io/pacific-observatory/interactive/text/ep_sentiment.html"

diff --git a/notebooks/text/EPU.ipynb b/notebooks/text/EPU.ipynb
diff --git a/notebooks/text/EPU_viz.ipynb b/notebooks/text/EPU_viz.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
    "id": "109cd21c",
    "metadata": {
     "scrolled": true
@@ -26,13 +26,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "17b0ce40",
    "metadata": {},
    "outputs": [],
    "source": [
     "countries = [\n",
-    "    \"pacific\", \"papua_new_guinea\", \"solomon_islands\", \"fiji\", \"samoa\",\n",
+    "    \"pacific\",\n",
+    "    \"papua_new_guinea\", \"solomon_islands\", \"fiji\", \"samoa\",\n",
     "    \"vanuatu\"\n",
     "]\n",
     "\n",

diff --git a/notebooks/text/Sentiment.ipynb b/notebooks/text/Sentiment.ipynb
@@ -26,10 +26,13 @@
     "from bokeh.layouts import Row, column, gridplot\n",
     "from bokeh.models import (Title, Legend, ColumnDataSource, Select, HoverTool,\n",
     "                          BoxZoomTool, ResetTool, DataTable, DateFormatter,\n",
-    "                          TableColumn)\n",
+    "                          TableColumn, LinearAxis, LegendItem, CustomJS, Toggle)\n",
     "from bokeh.models.layouts import TabPanel, Tabs\n",
     "from bokeh.plotting import figure, output_file, show, output_notebook\n",
-    "output_file(filename=sys.path[0] + \"docs/images/interactive/text/ep_sentiment.html\")"
+    "from bokeh.palettes import Category20\n",
+    "\n",
+    "output_file(filename=sys.path[0] +\n",
+    "            \"docs/images/interactive/text/ep_sentiment.html\")"
    ]
   },
   {
@@ -65,42 +68,41 @@
     "            pbar.update(1)\n",
     "    return results\n",
     "\n",
+    "\n",
     "def calculate_sentiment(df):\n",
     "\n",
-    "    df = df[(df.econ == True)].reset_index(drop=True)\n",
+    "    df = df[(df.econ) & (df.policy)].reset_index(drop=True)\n",
     "\n",
     "    sent_res = sentiment_analysis(df)\n",
     "\n",
     "    df[\"score\"] = [i[\"compound\"] for i in sent_res]\n",
     "    df[\"date\"] = df[\"date\"].apply(lambda x: parse(str(x)).date())\n",
     "    df[\"date\"] = pd.to_datetime(df[\"date\"])\n",
-    "    month_sent = (df.set_index(\"date\")\n",
-    "                    .groupby(pd.Grouper(freq=\"MS\"))[[\"score\"]]\n",
-    "                    .mean()\n",
-    "                    .reset_index())\n",
+    "    month_sent = (df.set_index(\"date\").groupby(\n",
+    "        pd.Grouper(freq=\"MS\"))[[\"score\"]].mean().reset_index())\n",
     "\n",
-    "    return month_sent"
+    "    return month_sent, df.score.mean(), df.score.std()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
    "id": "e0b8b9c7",
    "metadata": {
-    "scrolled": false
+    "scrolled": true
    },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5065/5065 [00:15<00:00, 317.71it/s]\n",
-      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 83/83 [00:00<00:00, 191.30it/s]\n",
-      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1220/1220 [00:03<00:00, 398.76it/s]\n",
-      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8180/8180 [00:16<00:00, 487.82it/s]\n",
-      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8252/8252 [00:20<00:00, 409.74it/s]\n",
-      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4430/4430 [00:09<00:00, 460.15it/s]\n",
-      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9506/9506 [00:28<00:00, 334.85it/s]\n"
+      "100%|███████████████████████████████████████████████████████████████████████████████████| 4474/4474 [00:15<00:00, 294.10it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████| 68/68 [00:00<00:00, 184.43it/s]\n",
+      "100%|█████████████████████████████████████████████████████████████████████████████████████| 963/963 [00:02<00:00, 350.31it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████| 7093/7093 [00:15<00:00, 462.35it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████| 6646/6646 [00:17<00:00, 381.18it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████| 3469/3469 [00:08<00:00, 419.12it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████| 6967/6967 [00:23<00:00, 298.41it/s]\n"
      ]
     }
    ],
@@ -109,90 +111,142 @@
     "for country in country_dirs:\n",
     "    country_name = country.split(\"/\")[-1]\n",
     "    news_dirs = [\n",
-    "        f\"{country}/{file}\" for file in os.listdir(country) if \"news\" in file\n",
+    "        f\"{country}/{file}\" for file in os.listdir(country)\n",
+    "        if \"news\" in file and \"ner\" not in file\n",
     "    ]\n",
     "\n",
     "    e = EPU(news_dirs, cutoff=None)\n",
     "    e.get_epu_category(\n",
     "        subset_condition=\"date >= '2016-01-01' and date < '2024-01-01'\")\n",
-    "    \n",
+    "\n",
     "    dfs = pd.DataFrame()\n",
     "    for _, df in e.raw_files:\n",
     "        df_select = df[[\"news\", \"date\", \"econ\", \"policy\"]]\n",
     "        dfs = pd.concat([dfs, df_select], axis=0).reset_index(drop=True)\n",
-    "    \n",
-    "    sent_df = calculate_sentiment(dfs)\n",
+    "\n",
+    "    sent_df, sent_mean, sent_std = calculate_sentiment(dfs)\n",
     "\n",
     "    min_date = str(sent_df.date.min().date())\n",
     "    max_date = str(sent_df.date.max().date())\n",
     "\n",
     "    sent_df = generate_continous_df(sent_df, min_date, max_date, freq=\"MS\")\n",
-    "    sent_mean, sent_std = sent_df.score.mean(), sent_df.score.std()\n",
     "    sent_df[\"z_score\"] = sent_df[\"score\"].apply(lambda x:\n",
     "                                                (x - sent_mean) / sent_std)\n",
-    "    sent_dfs.append((country_name, sent_df))\n",
-    "    \n",
+    "    sent_dfs.append((country_name, sent_df, sent_mean))\n",
+    "\n",
     "    saved_folder = output_dir + f\"{country_name}/sentiment/\"\n",
     "    if not os.path.exists(saved_folder):\n",
     "        os.mkdir(saved_folder)\n",
-    "    \n",
-    "    sent_df.to_csv(saved_folder + f\"{country_name}_sentiment.csv\", encoding=\"utf-8\")"
+    "\n",
+    "    sent_df.to_csv(saved_folder + f\"{country_name}_sentiment.csv\",\n",
+    "                   encoding=\"utf-8\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "a77e2c0c",
+   "execution_count": 5,
+   "id": "5d1cc6db",
    "metadata": {},
    "outputs": [],
    "source": [
     "tabs = []\n",
-    "for name, df in sent_dfs:\n",
+    "z_score_df = pd.DataFrame()\n",
+    "for name, df, sentiment in sent_dfs:\n",
     "    if name != \"tonga\":\n",
+    "        temp = df[[\"date\", \"z_score\"]].rename({\"z_score\": name}, axis=1)\n",
+    "        if z_score_df.empty:\n",
+    "            z_score_df = temp\n",
+    "        else:\n",
+    "            z_score_df = z_score_df.merge(temp, how=\"outer\", on=\"date\")\n",
+    "\n",
+    "z_score_df = z_score_df.round(3)\n",
+    "source = ColumnDataSource(z_score_df)\n",
+    "hover = HoverTool(tooltips=[('Date', '@date{%Y-%m}'), ('Z-Score', '@$name')],\n",
+    "                  formatters={'@date': 'datetime'})\n",
+    "p = figure(height=400,\n",
+    "           width=750,\n",
+    "           x_axis_type=\"datetime\",\n",
+    "           tools=[hover, BoxZoomTool(), ResetTool()])\n",
+    "\n",
+    "columns = [\n",
+    "    TableColumn(field=\"date\", title=\"Date\", formatter=DateFormatter()),\n",
+    "]\n",
+    "colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']\n",
+    "line_styles = ['solid', 'dashed', 'dotted', 'dotdash', 'dashdot', 'solid']\n",
+    "toggles = []\n",
+    "\n",
+    "for (idx, name) in enumerate(z_score_df.columns[1:]):\n",
+    "    if name != \"tonga\":\n",
+    "        prettfied_name = \" \".join(w[0].upper() + w[1:]\n",
+    "                                  for w in name.split(\"_\"))\n",
+    "        line = p.line(\"date\",\n",
+    "                      name,\n",
+    "                      source=z_score_df,\n",
+    "                      name=name,\n",
+    "                      line_width=2,\n",
+    "                      color=colors[idx],\n",
+    "                      line_dash=line_styles[idx])\n",
+    "        line.visible = idx < 1\n",
+    "        toggle = Toggle(label=f\"{prettfied_name}\",\n",
+    "                        button_type=\"light\",\n",
+    "                        active=(idx < 1))\n",
+    "        toggle.js_link('active', line, 'visible')\n",
+    "\n",
+    "        toggles.append(toggle)\n",
+    "        columns.append(TableColumn(field=name, title=prettfied_name))\n",
     "\n",
+    "dt = DataTable(source=source, columns=columns, width=720, height=300)\n",
+    "combined = column(Row(*toggles), p, dt)\n",
+    "tabs.append(TabPanel(child=combined, title=\"Comparison\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "22778802",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for name, df, sentiment in sent_dfs:\n",
+    "    if name != \"tonga\":\n",
+    "        df = df.round(3)\n",
     "        source = ColumnDataSource(df)\n",
     "\n",
     "        hover = HoverTool(tooltips=[('Date', '@date{%Y-%m}'),\n",
-    "                                    ('Sentiment', '@score'),\n",
-    "                                    ('Z-Score', '@z_score')],\n",
+    "                                    ('Sentiment', '@score')],\n",
     "                          formatters={'@date': 'datetime'})\n",
     "\n",
     "        p = figure(height=400,\n",
     "                   width=750,\n",
+    "                   y_range=(df.score.min() - 0.1, df.score.max() + 0.1),\n",
     "                   x_axis_type=\"datetime\",\n",
-    "                   tools=[hover, BoxZoomTool(), ResetTool()])\n",
+    "                   tools=[hover, BoxZoomTool(),\n",
+    "                          ResetTool()])\n",
     "\n",
+    "        # Define the primary y-axis for 'score'\n",
     "        p.line(\"date\",\n",
     "               \"score\",\n",
     "               source=source,\n",
     "               name=\"sentiment\",\n",
     "               line_width=2,\n",
     "               legend_label=\"Sentiment\")\n",
     "\n",
-    "        p.line(\"date\",\n",
-    "               \"z_score\",\n",
-    "               source=source,\n",
-    "               name=\"epu_unweighted\",\n",
-    "               color='darkorange',\n",
-    "               line_width=1.5,\n",
-    "               legend_label=\"Z score\")\n",
-    "\n",
+    "        p.hspan(y=sentiment, line_width=2, line_dash=[\"dashed\"])\n",
     "        p.legend.location = \"top_left\"\n",
     "        p.legend.click_policy = \"mute\"\n",
     "\n",
     "        columns = [\n",
     "            TableColumn(field=\"date\", title=\"Date\", formatter=DateFormatter()),\n",
-    "            TableColumn(field=\"score\", title=\"Sentiment (Avg.)\"),\n",
-    "            TableColumn(field=\"z_score\", title=\"Z Score\"),\n",
+    "            TableColumn(field=\"score\", title=\"Sentiment (Avg.)\")\n",
     "        ]\n",
-    "        dt = DataTable(source=source, columns=columns, width=700, height=300)\n",
+    "        dt = DataTable(source=source, columns=columns, width=720, height=300)\n",
     "        combined = column(p, dt)\n",
     "\n",
     "        # Uppercase the first letter of the country name\n",
     "        title = \" \".join(w[0].upper() + w[1:] for w in name.split(\"_\"))\n",
     "        tab = TabPanel(child=combined, title=title)\n",
     "        tabs.append(tab)\n",
-    "    \n",
+    "\n",
     "show(Tabs(tabs=tabs))"
    ]
   }