From 102f0567a203392990cbfc90e2046b8f40ff635c Mon Sep 17 00:00:00 2001 From: Charlie Zhang Date: Wed, 1 Nov 2023 11:06:05 -0400 Subject: [PATCH] update text assets. --- .../text/text_presentation.slides.html | 7925 +++++++++++++++++ docs/text/intro.md | 4 +- notebooks/text/text_classification.ipynb | 222 +- notebooks/text/word_embeddings.ipynb | 580 ++ 4 files changed, 8709 insertions(+), 22 deletions(-) create mode 100644 docs/images/interactive/text/text_presentation.slides.html create mode 100644 notebooks/text/word_embeddings.ipynb diff --git a/docs/images/interactive/text/text_presentation.slides.html b/docs/images/interactive/text/text_presentation.slides.html new file mode 100644 index 00000000..8aec62b8 --- /dev/null +++ b/docs/images/interactive/text/text_presentation.slides.html @@ -0,0 +1,7925 @@ + + + + + + + +text_presentation slides + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + diff --git a/docs/text/intro.md b/docs/text/intro.md index fdb25f78..f40592f6 100644 --- a/docs/text/intro.md +++ b/docs/text/intro.md @@ -1,5 +1,7 @@ # Introduction +An interactive presentation is available [here](https://worldbank.github.io/pacific-observatory/interactive/text/text_presentation.slides.html). + ## Economic Policy Uncertainty (EPU) Index Baker, Bloom, and Davis (2016) developed Economic Policy Uncertainty Index by exploiting the mention of economic, policy, and uncertainty group of terms on newspapers, and found that "innovations in policy uncertainty foreshadow declines in investment, output, and employment in the United States and … other 12 major economies."​ @@ -29,4 +31,4 @@ Sentiment analysis has been frequently employed in analyzing financial markets a ![Overview](../images/text/sib_econ_sentiment.png) Notes: Newspapers include Solomon Stars, Solomon Times, The Island Sun, Solomon Islands Broadcasting Corporation, ABC AU, and RNZ. ``` -```` +```` \ No newline at end of file diff --git a/notebooks/text/text_classification.ipynb b/notebooks/text/text_classification.ipynb index 3b1cd7f3..50a7d394 100644 --- a/notebooks/text/text_classification.ipynb +++ b/notebooks/text/text_classification.ipynb @@ -29,12 +29,13 @@ "from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix\n", "from sklearn.metrics import roc_curve, auc, roc_auc_score\n", "\n", - "from imblearn.over_sampling import SMOTE" + "from imblearn.over_sampling import SMOTE\n", + "import tiktoken" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "60b1ecc2", "metadata": {}, "outputs": [], @@ -45,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "26c8eb34", "metadata": {}, "outputs": [], @@ -61,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "3d73c1e3", "metadata": {}, "outputs": [], @@ -74,7 +75,7 @@ }, { "cell_type": "code", - 
"execution_count": 5, + "execution_count": 6, "id": "32b07a50", "metadata": {}, "outputs": [], @@ -93,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "93a68f44", "metadata": {}, "outputs": [], @@ -103,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "628af224", "metadata": {}, "outputs": [], @@ -118,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "1465f259", "metadata": {}, "outputs": [], @@ -128,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "1d37d67e", "metadata": {}, "outputs": [ @@ -157,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "019004c5", "metadata": {}, "outputs": [], @@ -166,6 +167,16 @@ "news_df[\"texts\"] = news_df[\"texts\"].apply(lambda x: \" \".join(i for i in x))" ] }, + { + "cell_type": "code", + "execution_count": 21, + "id": "903c0680", + "metadata": {}, + "outputs": [], + "source": [ + "text_embeddings = [nlp(text).vector for text in news_df[\"texts\"]] " + ] + }, { "cell_type": "code", "execution_count": 11, @@ -366,7 +377,7 @@ }, { "cell_type": "markdown", - "id": "9ad09d10", + "id": "2c2ff213", "metadata": {}, "source": [ "## PNG" @@ -528,7 +539,7 @@ }, { "cell_type": "markdown", - "id": "62658bdd", + "id": "239c6876", "metadata": {}, "source": [ "## SIB" @@ -537,7 +548,7 @@ { "cell_type": "code", "execution_count": 29, - "id": "26fc1f1a", + "id": "f06e3ffe", "metadata": {}, "outputs": [], "source": [ @@ -553,7 +564,7 @@ { "cell_type": "code", "execution_count": 33, - "id": "4b7d0190", + "id": "0f607747", "metadata": {}, "outputs": [], "source": [ @@ -571,7 +582,7 @@ { "cell_type": "code", "execution_count": 34, - "id": "52a9b5bd", + "id": "d120f397", "metadata": {}, "outputs": [ { @@ -599,7 +610,7 @@ { "cell_type": "code", "execution_count": 35, - "id": "53c1e3f0", + "id": "627457ec", "metadata": {}, "outputs": [], 
"source": [ @@ -611,7 +622,7 @@ { "cell_type": "code", "execution_count": 36, - "id": "aef30fe5", + "id": "37a1b8a9", "metadata": {}, "outputs": [], "source": [ @@ -622,7 +633,7 @@ { "cell_type": "code", "execution_count": 37, - "id": "5eb5b059", + "id": "f21c1b57", "metadata": {}, "outputs": [], "source": [ @@ -633,7 +644,7 @@ { "cell_type": "code", "execution_count": 38, - "id": "f6cbf725", + "id": "78383223", "metadata": {}, "outputs": [], "source": [ @@ -650,7 +661,7 @@ { "cell_type": "code", "execution_count": 39, - "id": "2b9bd362", + "id": "d21342dd", "metadata": {}, "outputs": [], "source": [ @@ -660,10 +671,179 @@ "sib_econ_sent = sib_econ.set_index(\"date\").groupby(pd.Grouper(freq=\"MS\"))[[\"sentiment\"]].mean().reset_index()" ] }, + { + "cell_type": "code", + "execution_count": 46, + "id": "20ecbb22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesentiment
912011-01-010.878500
922011-02-010.933483
932011-03-010.953929
942011-04-010.849688
952011-05-010.893327
962011-06-010.761678
972011-07-010.742758
982011-08-010.960333
992011-09-010.816700
1002011-10-010.817550
1012011-11-010.834325
1022011-12-010.948283
1032012-01-01-0.381800
1042012-02-010.689171
1052012-03-010.981733
1062012-04-010.369088
1072012-05-010.670069
1082012-06-010.659550
1092012-07-010.655968
1102012-08-010.484206
\n", + "
" + ], + "text/plain": [ + " date sentiment\n", + "91 2011-01-01 0.878500\n", + "92 2011-02-01 0.933483\n", + "93 2011-03-01 0.953929\n", + "94 2011-04-01 0.849688\n", + "95 2011-05-01 0.893327\n", + "96 2011-06-01 0.761678\n", + "97 2011-07-01 0.742758\n", + "98 2011-08-01 0.960333\n", + "99 2011-09-01 0.816700\n", + "100 2011-10-01 0.817550\n", + "101 2011-11-01 0.834325\n", + "102 2011-12-01 0.948283\n", + "103 2012-01-01 -0.381800\n", + "104 2012-02-01 0.689171\n", + "105 2012-03-01 0.981733\n", + "106 2012-04-01 0.369088\n", + "107 2012-05-01 0.670069\n", + "108 2012-06-01 0.659550\n", + "109 2012-07-01 0.655968\n", + "110 2012-08-01 0.484206" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sib_econ_sent[sib_econ_sent.date >= \"2011-01-01\"].head(20)" + ] + }, { "cell_type": "code", "execution_count": 42, - "id": "26d40d63", + "id": "2d75d3da", "metadata": {}, "outputs": [ { diff --git a/notebooks/text/word_embeddings.ipynb b/notebooks/text/word_embeddings.ipynb new file mode 100644 index 00000000..3d07d44a --- /dev/null +++ b/notebooks/text/word_embeddings.ipynb @@ -0,0 +1,580 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f7e0d625", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.chdir(\"../../\")\n", + "import pandas as pd\n", + "\n", + "import spacy\n", + "from nltk.corpus import stopwords\n", + "from src.text.epu import *\n", + "from src.text.utils import *\n", + "\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV\n", + "\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import LinearSVC\n", + "from xgboost import 
XGBClassifier\n", + "\n", + "from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix\n", + "from sklearn.metrics import roc_curve, auc, roc_auc_score\n", + "\n", + "import tiktoken" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "62978b00", + "metadata": {}, + "outputs": [], + "source": [ + "folder = os.getcwd() + \"/data/text/abc_au/\"\n", + "file_lists = [folder + file for file in os.listdir(folder) if \"news\" in file]\n", + "\n", + "news_df = pd.DataFrame()\n", + "for file in file_lists:\n", + " df = pd.read_csv(file).drop(\"Unnamed: 0\", axis=1)\n", + " if news_df.empty:\n", + " news_df = df\n", + " else:\n", + " news_df = pd.concat([news_df, df], axis=0).reset_index(drop=True)\n", + " \n", + "news_df = news_df.dropna().reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c6d2d1f7", + "metadata": {}, + "outputs": [], + "source": [ + "econ_list = [\n", + " 'economy', 'economic', 'economics', 'business', 'commerce', 'finance',\n", + " 'industry', \"assistance\", \"science and technology\", \"trade\", \"mining and metals industry\"\n", + "]\n", + "news_df[\"econ\"] = news_df[\"tags\"].str.lower().apply(is_in_word_list, terms=econ_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "354823d7", + "metadata": {}, + "outputs": [], + "source": [ + "nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n", + "spacy_stopwords = list(nlp.Defaults.stop_words)\n", + "\n", + "nltk_stopwords = stopwords.words(\"english\")\n", + "nltk_unique = [sw for sw in nltk_stopwords if sw not in spacy_stopwords]\n", + "spacy_stopwords.extend(nltk_unique)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9e6ef2ad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stopwords has been done.\n" + ] + } + ], + "source": [ + "from src.text.utils import *\n", + "data = 
news_df.news.values.tolist()\n", + "data_words = list(sent_to_words(data))\n", + "\n", + "bigram = gensim.models.Phrases(data_words, min_count=20, threshold=100)\n", + "bigram_mod = gensim.models.phrases.Phraser(bigram)\n", + "\n", + "trigram = gensim.models.Phrases(bigram[data_words], min_count=20, threshold=100)\n", + "trigram_mod = gensim.models.phrases.Phraser(trigram)\n", + "\n", + "texts_preprocessed = preprocess_text(data_words, spacy_stopwords, bigram_mod,\n", + " trigram_mod, nlp)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "798a3dfd", + "metadata": {}, + "outputs": [], + "source": [ + "news_df[\"texts\"] = texts_preprocessed\n", + "news_df[\"texts\"] = news_df[\"texts\"].apply(lambda x: \" \".join(i for i in x))" + ] + }, + { + "cell_type": "markdown", + "id": "4e632757", + "metadata": {}, + "source": [ + "## Glove" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "984c86d4", + "metadata": {}, + "outputs": [], + "source": [ + "import gensim.downloader\n", + "# Load the pretrained \"word2vec-google-news-300\" embeddings (word2vec, not GloVe)\n", + "glove_vectors = gensim.downloader.load('word2vec-google-news-300')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "42cc393e", + "metadata": {}, + "outputs": [], + "source": [ + "def get_average_word_vectors(text, model):\n", + " words = text.split()\n", + " vectors = [model[word] for word in words if word in model]\n", + " if vectors:\n", + " return np.mean(vectors, axis=0)\n", + " else:\n", + " return np.zeros(model.vector_size)\n", + "\n", + "text_embeddings = [get_average_word_vectors(text, glove_vectors) for text in news_df[\"texts\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "c956582a", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " text_embeddings, \n", + " news_df['econ'].astype(int), \n", + " test_size=0.1, \n", + " random_state=42, \n", + " shuffle=True) " + ] + }, + { + 
"cell_type": "code", + "execution_count": 28, + "id": "78f7c8fb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.94 0.98 0.96 1164\n", + " 1 0.88 0.66 0.75 208\n", + "\n", + " accuracy 0.93 1372\n", + " macro avg 0.91 0.82 0.86 1372\n", + "weighted avg 0.93 0.93 0.93 1372\n", + "\n" + ] + } + ], + "source": [ + "clf = XGBClassifier(tree_method=\"hist\", enable_categorical=True, random_state=42)\n", + "clf.fit(X_train, y_train)\n", + "y_predict = clf.predict(X_test)\n", + "print(classification_report(y_test, y_predict))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "89304d48", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RandomForestClassifier: Mean Accuracy = 86.58%; Mean F1-macro = 58.19%; Mean recall-macro = 13.85%; Mean precision-macro = 83.43%\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/czhang/miniforge3/envs/po/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.\n", + " warnings.warn(\n", + "/Users/czhang/miniforge3/envs/po/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.\n", + " warnings.warn(\n", + "/Users/czhang/miniforge3/envs/po/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.\n", + " warnings.warn(\n", + "/Users/czhang/miniforge3/envs/po/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. 
Set the value of `dual` explicitly to suppress the warning.\n", + " warnings.warn(\n", + "/Users/czhang/miniforge3/envs/po/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LinearSVC: Mean Accuracy = 88.79%; Mean F1-macro = 73.16%; Mean recall-macro = 41.38%; Mean precision-macro = 72.53%\n", + "LogisticRegression: Mean Accuracy = 88.27%; Mean F1-macro = 70.81%; Mean recall-macro = 36.23%; Mean precision-macro = 72.23%\n", + "XGBClassifier: Mean Accuracy = 92.60%; Mean F1-macro = 84.00%; Mean recall-macro = 63.87%; Mean precision-macro = 83.24%\n" + ] + } + ], + "source": [ + "models = [\n", + " RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42),\n", + " LinearSVC(random_state=42),\n", + " LogisticRegression(solver='liblinear', C=1, penalty='l2', random_state=42),\n", + " XGBClassifier(tree_method=\"hist\", enable_categorical=True, random_state=42)\n", + "]\n", + "\n", + "scoring = ['accuracy', 'f1_macro', 'recall', 'precision']\n", + "\n", + "for model in models:\n", + " model_name = model.__class__.__name__\n", + " result = cross_validate(model,\n", + " X_train,\n", + " y_train,\n", + " cv=5,\n", + " scoring=scoring)\n", + " print(\n", + " \"%s: Mean Accuracy = %.2f%%; Mean F1-macro = %.2f%%; Mean recall-macro = %.2f%%; Mean precision-macro = %.2f%%\"\n", + " % (model_name, result['test_accuracy'].mean() * 100,\n", + " result['test_f1_macro'].mean() * 100, result['test_recall'].mean() *\n", + " 100, result['test_precision'].mean() * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "23593f88", + "metadata": {}, + "outputs": [], + "source": [ + "sib_files = [\n", + " os.getcwd() + \"/data/text/rnz/solomon_islands_rnz_news.csv\",\n", + " os.getcwd() + 
\"/data/text/abc_au/solomon_islands_abc_news.csv\",\n", + "]\n", + "\n", + "sib_folder = os.getcwd() + \"/data/text/solomon_islands/\"\n", + "sib_files.extend([sib_folder+file for file in os.listdir(sib_folder) if \"news\" in file])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "b91e72ee", + "metadata": {}, + "outputs": [], + "source": [ + "sib = pd.DataFrame()\n", + "for file in sib_files:\n", + " temp = pd.read_csv(file).drop(\"Unnamed: 0\", axis=1)\n", + " if sib.empty:\n", + " sib = temp\n", + " else:\n", + " sib = pd.concat([sib, temp], axis=0).reset_index(drop=True)\n", + "\n", + "sib = sib[sib.news.isna() != True].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "966d163b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stopwords has been done.\n" + ] + } + ], + "source": [ + "sib_data = sib.news.values.tolist()\n", + "sib_data_words = list(sent_to_words(sib_data))\n", + "\n", + "bigram = gensim.models.Phrases(sib_data_words, min_count=20, threshold=100)\n", + "bigram_mod = gensim.models.phrases.Phraser(bigram)\n", + "\n", + "trigram = gensim.models.Phrases(bigram[sib_data_words], min_count=20, threshold=100)\n", + "trigram_mod = gensim.models.phrases.Phraser(trigram)\n", + "\n", + "sib_texts = preprocess_text(sib_data_words, spacy_stopwords, bigram_mod,\n", + " trigram_mod, nlp)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "7c09981e", + "metadata": {}, + "outputs": [], + "source": [ + "sib[\"date\"] = pd.to_datetime(sib[\"date\"], format='mixed')\n", + "sib[\"texts\"] = sib_texts\n", + "sib[\"texts\"] = sib[\"texts\"].apply(lambda x: \" \".join(i for i in x))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "8d85380f", + "metadata": {}, + "outputs": [], + "source": [ + "sib_pred = clf.predict([get_average_word_vectors(text, glove_vectors) for text in sib[\"texts\"]])" + ] + }, + { + 
"cell_type": "code", + "execution_count": 41, + "id": "f9927dad", + "metadata": {}, + "outputs": [], + "source": [ + "sib[\"econ_pred\"] = sib_pred\n", + "sib_econ = sib[sib.econ_pred == 1].reset_index(drop=True)\n", + "\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", + "sent_df = pd.DataFrame()\n", + "for row in sib_econ.news:\n", + " sample = row.replace(\"\\n\", \" \").strip()\n", + " sid = SentimentIntensityAnalyzer()\n", + " ss = sid.polarity_scores(sample)\n", + " ss_temp = pd.DataFrame(ss, columns=ss.keys(), index=[0])\n", + " sent_df = pd.concat([sent_df, ss_temp], axis=0)\n", + " \n", + "sib_econ[\"sentiment\"] = sent_df[\"compound\"].tolist()\n", + "sib_econ[\"date\"] = [d.date() for d in sib_econ[\"date\"]]\n", + "sib_econ[\"date\"] = [pd.to_datetime(d) for d in sib_econ[\"date\"]]\n", + "sib_econ_sent = sib_econ.set_index(\"date\").groupby(pd.Grouper(freq=\"MS\"))[[\"sentiment\"]].mean().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "414cd674", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "sib_econ_sent.plot(x=\"date\", ax=ax);\n", + "plt.axhline(y=0, color=\"red\", linestyle=\"--\")\n", + "plt.xlabel(\"Date\")\n", + "plt.title(\"Economic-Related Sentiment in Solomon Islands\")\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "id": "dcda0ef6", + "metadata": {}, + "source": [ + "## Embedding Layer + CNN" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "c529476d", + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.keras.preprocessing.text import Tokenizer\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense\n", + "from tensorflow.keras.models import Sequential\n", + "import string\n", + "from nltk.tokenize import word_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "6d8323b5", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " news_df[\"news\"], \n", + " news_df['econ'].astype(int), \n", + " test_size=0.1, \n", + " random_state=42, \n", + " shuffle=True) " + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "46b705e1", + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(text):\n", + " text = text.lower()\n", + " text = ''.join([word for word in text if word not in string.punctuation])\n", + " tokens = word_tokenize(text)\n", + " tokens = [word for word in tokens if word not in spacy_stopwords]\n", + " return ' '.join(tokens)\n", + "\n", + "X_train = X_train.apply(preprocess)\n", + "X_test = X_test.apply(preprocess)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "c52523f2", + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + 
"output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[122], line 18\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Create a weight matrix for the embedding layer\u001b[39;00m\n\u001b[1;32m 17\u001b[0m embedding_matrix \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros((vocab_size, \u001b[38;5;241m300\u001b[39m))\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m word, i \u001b[38;5;129;01min\u001b[39;00m tokenizer\u001b[38;5;241m.\u001b[39mword_index\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m word \u001b[38;5;129;01min\u001b[39;00m glove_vectors\u001b[38;5;241m.\u001b[39mindex_to_key:\n\u001b[1;32m 20\u001b[0m embedding_matrix[i] \u001b[38;5;241m=\u001b[39m glove_vectors[word]\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# Tokenize the text data\n", + "tokenizer = Tokenizer()\n", + "tokenizer.fit_on_texts(X_train)\n", + "\n", + "X_train = tokenizer.texts_to_sequences(X_train)\n", + "X_test = tokenizer.texts_to_sequences(X_test)\n", + "\n", + "vocab_size = len(tokenizer.word_index) + 1\n", + "\n", + "# Pad the sequences to a fixed length\n", + "max_length = 300\n", + "X_train = pad_sequences(X_train, maxlen=max_length, padding='post')\n", + "X_test = pad_sequences(X_test, maxlen=max_length, padding='post')\n", + "\n", + "\n", + "# Create a weight matrix for the embedding layer\n", + "embedding_matrix = np.zeros((vocab_size, 300))\n", + "for word, i in tokenizer.word_index.items():\n", + " if word in glove_vectors.index_to_key:\n", + " embedding_matrix[i] = glove_vectors[word]\n", + "\n", + "# Define the CNN model\n", + "model = Sequential()\n", + "model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False))\n", + 
"model.add(Conv1D(128, 5, activation='relu'))\n", + "model.add(MaxPooling1D(5))\n", + "model.add(Conv1D(128, 5, activation='relu'))\n", + "model.add(MaxPooling1D(5))\n", + "model.add(Flatten())\n", + "model.add(Dense(128, activation='relu'))\n", + "model.add(Dense(1, activation='sigmoid'))\n", + "\n", + "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", + "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "2eb19e13", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "43/43 [==============================] - 0s 3ms/step - loss: 0.6626 - accuracy: 0.9155\n" + ] + }, + { + "data": { + "text/plain": [ + "[0.6626473665237427, 0.9154518842697144]" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.evaluate(X_test, y_test)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "po", + "language": "python", + "name": "po" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}