Skip to content

Commit

Permalink
update code and outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
ccxzhang committed Oct 23, 2023
1 parent d805455 commit 91d5d09
Show file tree
Hide file tree
Showing 22 changed files with 48,907 additions and 46,614 deletions.
270 changes: 135 additions & 135 deletions data/text/abc_au/marshall_islands_abc_news.csv

Large diffs are not rendered by default.

11,418 changes: 5,709 additions & 5,709 deletions data/text/abc_au/pacific_abc_news.csv

Large diffs are not rendered by default.

9,236 changes: 4,618 additions & 4,618 deletions data/text/abc_au/papua_new_guinea_abc_news.csv

Large diffs are not rendered by default.

3,382 changes: 1,691 additions & 1,691 deletions data/text/abc_au/solomon_islands_abc_news.csv

Large diffs are not rendered by default.

1,616 changes: 808 additions & 808 deletions data/text/abc_au/tonga_abc_news.csv

Large diffs are not rendered by default.

1,534 changes: 767 additions & 767 deletions data/text/abc_au/vanuatu_abc_news.csv

Large diffs are not rendered by default.

22,914 changes: 11,514 additions & 11,400 deletions data/text/rnz/fiji_rnz_news.csv

Large diffs are not rendered by default.

21,218 changes: 10,631 additions & 10,587 deletions data/text/rnz/papua_new_guinea_rnz_news.csv

Large diffs are not rendered by default.

16,344 changes: 8,254 additions & 8,090 deletions data/text/rnz/samoa_rnz_news.csv

Large diffs are not rendered by default.

4,084 changes: 2,042 additions & 2,042 deletions data/text/rnz/solomon_islands_rnz_news.csv

Large diffs are not rendered by default.

600 changes: 0 additions & 600 deletions notebooks/text/Google-Uncertainty-Index.ipynb

This file was deleted.

434 changes: 434 additions & 0 deletions notebooks/text/png_epu.ipynb

Large diffs are not rendered by default.

740 changes: 740 additions & 0 deletions notebooks/text/png_gui.ipynb

Large diffs are not rendered by default.

597 changes: 533 additions & 64 deletions notebooks/text/si-classifcation.ipynb

Large diffs are not rendered by default.

355 changes: 300 additions & 55 deletions notebooks/text/solomon_epu.ipynb

Large diffs are not rendered by default.

249 changes: 249 additions & 0 deletions outputs/text/png/png_epu.csv

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions outputs/text/png/png_job_graph.html

Large diffs are not rendered by default.

95 changes: 95 additions & 0 deletions outputs/text/png/png_job_gui.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
,date,job_gui,job_cycle
0,2016-01-01,128.99999999999997,-34.038099151105484
1,2016-02-01,259.88000000000005,95.08385433211112
2,2016-03-01,190.45,23.896070454981555
3,2016-04-01,122.55000000000001,-45.76192181455556
4,2016-05-01,255.60000000000008,85.52922210817857
5,2016-06-01,166.83000000000004,-5.000800090888902
6,2016-07-01,55.6,-117.99295067353202
7,2016-08-01,405.0,229.6418466848955
8,2016-09-01,271.0,93.87357874847416
9,2016-10-01,54.94,-123.95953964654507
10,2016-11-01,90.0,-90.6800179970804
11,2016-12-01,196.31,13.840590677879192
12,2017-01-01,162.25,-22.018566949158327
13,2017-02-01,94.17,-91.9084510003661
14,2017-03-01,111.75999999999999,-76.13985170156732
15,2017-04-01,206.0,16.267149891561417
16,2017-05-01,189.8,-1.7769397778007487
17,2017-06-01,125.28,-68.15173978460658
18,2017-07-01,77.99999999999999,-117.29685549285365
19,2017-08-01,132.82,-64.35136640434987
20,2017-09-01,261.3,62.24655304742609
21,2017-10-01,345.96000000000004,145.0192249673167
22,2017-11-01,207.06,4.228491162687021
23,2017-12-01,97.9,-106.82492553459895
24,2018-01-01,343.0,136.3796650800759
25,2018-02-01,195.0,-13.516222522270482
26,2018-03-01,224.20000000000002,13.787873817149205
27,2018-04-01,140.64000000000001,-71.66747945103737
28,2018-05-01,204.0,-10.201822264117766
29,2018-06-01,268.32000000000005,52.22585843043112
30,2018-07-01,263.22,45.23665440289662
31,2018-08-01,143.0,-76.86874555373632
32,2018-09-01,181.72,-40.03000169474217
33,2018-10-01,91.44999999999999,-132.1761811523587
34,2018-11-01,118.03999999999999,-107.45604218535388
35,2018-12-01,292.0,64.64267682544545
36,2019-01-01,614.0,384.7930666353404
37,2019-02-01,434.0,202.95771921354543
38,2019-03-01,370.37,137.50625744697095
39,2019-04-01,315.0,80.32673819074316
40,2019-05-01,279.62,43.146157294915355
41,2019-06-01,161.31000000000003,-76.95910919553762
42,2019-07-01,165.3,-74.76301815352147
43,2019-08-01,315.90000000000003,74.04106736834524
44,2019-09-01,184.20000000000005,-59.45963880741607
45,2019-10-01,149.64000000000001,-95.8284941628163
46,2019-11-01,345.32,98.03160261364329
47,2019-12-01,162.51999999999998,-86.60150774914575
48,2020-01-01,173.85000000000002,-77.12074093897843
49,2020-02-01,275.37,22.53165557786042
50,2020-03-01,142.59,-112.1359705974682
51,2020-04-01,61.199999999999996,-195.43544571917062
52,2020-05-01,162.18,-96.38773079476641
53,2020-06-01,326.36,65.83772115777498
54,2020-07-01,172.87,-89.62779914702514
55,2020-08-01,214.79999999999998,-49.693509001753085
56,2020-09-01,178.98000000000002,-87.52793412647077
57,2020-10-01,267.90999999999997,-0.629216803670829
58,2020-11-01,274.40000000000003,3.8151760540166606
59,2020-12-01,213.11999999999998,-59.522217610970955
60,2021-01-01,310.8,36.09111070571731
61,2021-02-01,541.0,264.2181287848027
62,2021-03-01,174.98999999999998,-103.86847407378613
63,2021-04-01,297.05000000000007,16.111952709750312
64,2021-05-01,324.7199999999999,41.70086116948602
65,2021-06-01,212.10000000000002,-73.00042098112664
66,2021-07-01,306.00000000000006,18.819112205459817
67,2021-08-01,365.40000000000003,76.14102995164632
68,2021-09-01,421.94999999999993,130.6167562706346
69,2021-10-01,259.08000000000004,-34.3228723323204
70,2021-11-01,188.48000000000005,-106.98802719730585
71,2021-12-01,197.64000000000001,-99.88861482743081
72,2022-01-01,506.0,206.4162837990969
73,2022-02-01,330.19,28.558358449554078
74,2022-03-01,362.70000000000005,59.02770617297824
75,2022-04-01,237.25,-68.45579633929731
76,2022-05-01,421.08,113.3472721433651
77,2022-06-01,239.85,-69.90313894009185
78,2022-07-01,159.04,-152.72795474387462
79,2022-08-01,368.0,54.222438953882374
80,2022-09-01,333.41,17.628834831485733
81,2022-10-01,367.07000000000005,49.29160718422537
82,2022-11-01,312.0,-7.76900571756903
83,2022-12-01,401.0,79.24685405995746
84,2023-01-01,437.0,113.26910439669138
85,2023-02-01,231.20000000000002,-94.50294830011998
86,2023-03-01,472.0,144.3291283867676
87,2023-04-01,255.50000000000003,-74.13550393598578
88,2023-05-01,317.89000000000004,-13.708797312402055
89,2023-06-01,190.82,-142.7421317532939
90,2023-07-01,276.9,-58.62678149171688
91,2023-08-01,332.86,-4.632919355388992
92,2023-09-01,264.55,-74.91026580488727
93,2023-10-01,457.2,115.77149444704622
61 changes: 61 additions & 0 deletions outputs/text/png/png_price_graph.html

Large diffs are not rendered by default.

30 changes: 21 additions & 9 deletions src/google_trends.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
from googleapiclient.errors import HttpError
# local import
from scripts.python.config import GoogleAPIkey
import logging

# Name and version of the Google API service; GT.__init__ passes these to
# `build(serviceName=..., version=...)`.
SERVICE_NAME = 'trends'
SERVICE_VERSION = 'v1beta'
# Discovery document URL for the same service/version.
# NOTE(review): the line that consumes this is outside the visible hunk —
# presumably it is also handed to `build(...)`; confirm.
_DISCOVERY_SERVICE_URL = 'https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest'


class GT:
def __init__(self, _GOOGLE_API_KEY = GoogleAPIkey):
def __init__(self, _GOOGLE_API_KEY=GoogleAPIkey):
self.service = build(
serviceName=SERVICE_NAME,
version=SERVICE_VERSION,
Expand Down Expand Up @@ -42,14 +43,12 @@ def get_health_trends(self, terms, timelineResolution="month"):
date.today() + timedelta(days=1), dtime.min)
raise RuntimeError('%s: blocked until %s' %
(reason, self.block_until))
import logging
logging.warning(http_error)
return []


def get_graph(self, terms,
restrictions_geo,
restrictions_startDate="2004-01"):
restrictions_geo,
restrictions_startDate="2004-01"):
graph = self.service.getGraph(
terms=terms,
restrictions_geo=restrictions_geo,
Expand All @@ -61,15 +60,28 @@ def get_graph(self, terms,
return response

except HttpError as http_error:
import logging
logging.warning(http_error)

return []

def get_top_topics(self, term,
                   restrictions_geo,
                   restrictions_startDate="2004-01"):
    """Query the service's ``getTopTopics`` endpoint for a single term.

    Args:
        term: Search term forwarded as ``term``.
        restrictions_geo: Geography restriction forwarded as
            ``restrictions_geo``.
        restrictions_startDate: Start of the query window, forwarded as
            ``restrictions_startDate``. Defaults to ``"2004-01"``.

    Returns:
        Whatever ``execute()`` returns on success; an empty list if executing
        the request raises any ``Exception`` (the error is logged as a
        warning, not re-raised).
    """
    # Building the request happens outside the try block, so errors raised
    # while constructing it propagate to the caller.
    request = self.service.getTopTopics(
        term=term,
        restrictions_geo=restrictions_geo,
        restrictions_startDate=restrictions_startDate,
    )
    try:
        return request.execute()
    except Exception as err:
        logging.warning(err)
    return []

@staticmethod
def to_df(result:json) -> pd.DataFrame:
df = pd.json_normalize(result["lines"], meta=["term"], record_path=["points"])
def to_df(result: json) -> pd.DataFrame:
df = pd.json_normalize(result["lines"], meta=[
"term"], record_path=["points"])
if "date" in df.columns:
df["date"] = pd.to_datetime(df["date"])

Expand Down
138 changes: 99 additions & 39 deletions src/text/epu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,103 @@
is_in_word_list
)

# Default economy-related terms; used as the default for `econ_terms` in
# EPU.__init__. Terms are matched against lower-cased article text, so keep
# entries lower-case.
ECON_LIST = [
"economy", "economic", "economics", "business", "commerce", "finance",
"industry"
]

def process_data(filepath: str) -> pd.DataFrame:
    """Read a news CSV and prepare it for monthly aggregation.

    Args:
        filepath (str): Path to the CSV file. The file must contain an
            "Unnamed: 0" column (a saved index) plus "news" and "date" columns.

    Returns:
        pd.DataFrame: The data with the "Unnamed: 0" column dropped, newline
        characters removed from the "news" text, "date" converted to datetime,
        and a "ym" column of the form "<year>-<month>" (month not zero-padded).
    """
    df = pd.read_csv(filepath).drop("Unnamed: 0", axis=1)
    # Series.replace without regex=True only swaps cells whose *entire* value
    # is "\n"; regex=True substitutes newlines embedded inside the text and
    # leaves non-string cells (e.g. NaN) untouched.
    df["news"] = df["news"].replace("\n", "", regex=True)
    df["date"] = pd.to_datetime(df["date"])
    df["ym"] = [str(d.year) + "-" + str(d.month) for d in df.date]
    return df


def get_news_count(data: pd.DataFrame, column: str) -> pd.DataFrame:
    """Count non-null values of one column per year-month ("ym") group.

    Args:
        data (pd.DataFrame): Input frame; must contain "date" and "ym" columns
            in addition to ``column``.
        column (str): Name of the column whose non-null entries are counted.

    Returns:
        pd.DataFrame: One row per "ym" value with the columns "ym" and
        "<column>_count".
    """
    name = str(column)
    indexed = data.set_index("date")
    per_month = indexed.groupby("ym")[[name]].count()
    flat = per_month.reset_index()
    return flat.rename(columns={name: name + "_count"})
# Default policy-related terms; used as the default for `policy_terms` in
# EPU.__init__. Entries are lower-case because they are matched against
# lower-cased article text.
# NOTE(review): "cbsi" is presumably the Central Bank of Solomon Islands,
# which would make this default list country-specific — confirm.
POLICY_LIST = [
"government", "governmental", "authorities", "minister", "ministry",
"parliament", "parliamentary", "tax", "regulation", "legislation",
"central bank", "cbsi", "imf", "world bank", "international monetary fund",
"debt"
]

# Default uncertainty-related terms; used as the default for
# `uncertainty_terms` in EPU.__init__. Entries are lower-case because they are
# matched against lower-cased article text.
UNCERTAINTY_LIST = [
"uncertain", "uncertainty", "uncertainties", "unknown", "unstable",
"unsure", "undetermined", "risky", "risk", "not certain", "non-reliable"
]


class EPU:
    """Compute a monthly economy/policy/uncertainty (EPU) news ratio from a CSV.

    Typical use: construct with the CSV path, call ``get_epu_category()`` to
    flag each article, then ``get_epu_stats()`` to aggregate by month.
    """

    def __init__(self, filepath, econ_terms=ECON_LIST, policy_terms=POLICY_LIST,
                 uncertainty_terms=UNCERTAINTY_LIST, **kwargs):
        """Store the data path and the three term lists.

        Args:
            filepath: Path to the news CSV; must exist.
            econ_terms: Terms marking an article as economy-related.
            policy_terms: Terms marking an article as policy-related.
            uncertainty_terms: Terms marking an article as uncertainty-related.
            **kwargs: Accepted for call compatibility; not used.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Cannot find {filepath}")
        self.filepath = filepath
        self.econ_terms = econ_terms
        self.policy_terms = policy_terms
        self.uncertainty_terms = uncertainty_terms

    @staticmethod
    def process_data(filepath: str) -> pd.DataFrame:
        """Read a news CSV and prepare it for monthly aggregation.

        Args:
            filepath (str): Path to the CSV file. The file must contain an
                "Unnamed: 0" column (a saved index) plus "news" and "date"
                columns.

        Returns:
            pd.DataFrame: The data with "Unnamed: 0" dropped, newline
            characters removed from the "news" text, "date" converted to
            datetime, and a "ym" column of the form "<year>-<month>" (month
            not zero-padded).
        """
        df = pd.read_csv(filepath).drop("Unnamed: 0", axis=1)
        # Without regex=True, Series.replace only swaps cells whose entire
        # value is "\n"; regex=True removes newlines embedded in the text and
        # leaves non-string cells (e.g. NaN) untouched.
        df["news"] = df["news"].replace("\n", "", regex=True)
        df["date"] = pd.to_datetime(df["date"])
        df["ym"] = [str(d.year) + "-" + str(d.month) for d in df.date]
        return df

    @staticmethod
    def get_count(data: pd.DataFrame, column: str) -> pd.DataFrame:
        """Count non-null values of one column per year-month ("ym") group.

        Args:
            data (pd.DataFrame): Input frame; must contain "date" and "ym"
                columns in addition to ``column``.
            column (str): Name of the column whose non-null entries are counted.

        Returns:
            pd.DataFrame: One row per "ym" value with the columns "ym" and
            "<column>_count".
        """
        name = str(column)
        count_df = (data.set_index("date")
                    .groupby("ym")[[name]]
                    .count()
                    .reset_index()
                    .rename({name: name + "_count"}, axis=1))
        return count_df

    def get_epu_category(self):
        """Load the CSV into ``self.raw`` and flag each article.

        Adds the columns "econ", "policy" and "uncertain" (result of
        ``is_in_word_list`` on the lower-cased text for each term list) and
        "epu", which is True only when all three flags are True.
        """
        self.raw = self.process_data(self.filepath)
        # Lower-case once instead of once per term list.
        lowered = self.raw["news"].str.lower()
        flag_terms = zip(
            ["econ", "policy", "uncertain"],
            [self.econ_terms, self.policy_terms, self.uncertainty_terms],
        )
        for col, terms in flag_terms:
            self.raw[col] = lowered.apply(is_in_word_list, terms=terms)
        # Explicit `== True` is kept on purpose: the return type of
        # is_in_word_list is not visible here, and the comparison normalises
        # whatever it returns into a strict boolean mask.
        self.raw["epu"] = (
            (self.raw.econ == True)  # noqa: E712
            & (self.raw.policy == True)  # noqa: E712
            & (self.raw.uncertain == True)  # noqa: E712
        )

    def get_epu_stats(self, cutoff: str = None) -> pd.DataFrame:
        """Aggregate article flags into monthly EPU statistics.

        Args:
            cutoff (str, optional): If given, the standard deviation used for
                scaling is computed only from months with ``date <= cutoff``;
                otherwise the whole sample is used.

        Returns:
            pd.DataFrame: One row per calendar month between the first and
            last observed month, with "date", "news_count", "epu_count",
            "ratio" (epu_count / news_count) and "z_score" (ratio divided by
            the chosen standard deviation; the ratio is not mean-centred).
            Months with no articles have zero counts and a NaN ratio.
        """
        # Allow calling this directly without a prior get_epu_category().
        if not hasattr(self, "raw"):
            self.get_epu_category()

        news_count = self.get_count(self.raw, "news")
        epu_count = self.get_count(self.raw[self.raw["epu"]], "epu")
        self.epu_stat = news_count.merge(
            epu_count, how="left", on="ym").fillna(0)
        self.epu_stat["date"] = pd.to_datetime(self.epu_stat["ym"])

        # Re-index onto a complete month-start calendar so months without any
        # article still appear (with zero counts).
        self.min_date, self.max_date = self.epu_stat.date.min(), self.epu_stat.date.max()
        self.date_df = pd.DataFrame(pd.date_range(
            self.min_date, self.max_date, freq="MS"), columns=["date"])

        self.epu_stat = (self.date_df.merge(self.epu_stat, how="left", on="date")
                         .fillna(0).drop("ym", axis=1))
        self.epu_stat["ratio"] = (
            self.epu_stat["epu_count"] / self.epu_stat["news_count"]
        )

        # Previously the cutoff-restricted std was always overwritten by the
        # full-sample std, so `cutoff` had no effect.
        if cutoff is not None:
            in_window = self.epu_stat.date <= cutoff
            std = self.epu_stat.loc[in_window, "ratio"].std()
        else:
            std = self.epu_stat["ratio"].std()
        self.epu_stat["z_score"] = self.epu_stat["ratio"] / std

        return self.epu_stat
Loading

0 comments on commit 91d5d09

Please sign in to comment.