From 019b3028ddc732f73f6c7c7ad0867a221764f04d Mon Sep 17 00:00:00 2001
From: Andrew McAllister <andrew.t.mcallister@gmail.com>
Date: Sun, 31 Jan 2021 09:31:56 +0100
Subject: [PATCH] Add output to readme example

---
 README.md                    |  21 ++-
 examples/kw_extraction.ipynb | 266 ++++++++++++++++++++++++++++-------
 2 files changed, 235 insertions(+), 52 deletions(-)
diff --git a/README.md b/README.md
index 45010e8..9b4c664 100644
--- a/README.md
+++ b/README.md
@@ -81,8 +81,8 @@ The following outlines using kwx to derive keywords from a text corpus with `pro
 from kwx.utils import prepare_data
 from kwx.model import extract_kws
 
-input_language = "english"
-num_keywords = 10
+input_language = "english"  # see kwx.languages for options
+num_keywords = 15
 num_topics = 10
 ignore_words = ["words", "user", "knows", "they", "don't", "want"]
 
@@ -110,6 +110,23 @@ bert_kws = extract_kws(
 )
 ```
 
+```
+The BERT keywords are:
+
+['time', 'flight', 'plane', 'southwestair', 'ticket', 'cancel', 'united', 'baggage',
+'love', 'virginamerica', 'service', 'customer', 'delay', 'late', 'hour']
+
+Are there words that should be removed [y/n]? y
+Type or copy word(s) to be removed: southwestair, united, virginamerica
+
+The new BERT keywords are:
+
+['late', 'baggage', 'service', 'flight', 'time', 'love', 'book', 'customer',
+'response', 'hold', 'hour', 'cancel', 'cancelled_flighted', 'delay', 'plane']
+
+Are there words that should be removed [y/n]? n
+```
+
 The model will be re-ran until all words known to be unreasonable are removed for a suitable output. `kwx.model.gen_files` could also be used as a run-all function that produces a directory with a keyword text file and visuals (for experienced users wanting quick results).
 
 # Visuals
diff --git a/examples/kw_extraction.ipynb b/examples/kw_extraction.ipynb
index 8746c41..f9def64 100644
--- a/examples/kw_extraction.ipynb
+++ b/examples/kw_extraction.ipynb
@@ -26,11 +26,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2021-01-29T19:27:58.045088Z",
-     "start_time": "2021-01-29T19:27:56.856283Z"
+     "end_time": "2021-01-31T07:59:40.153415Z",
+     "start_time": "2021-01-31T07:59:33.083450Z"
     },
     "slideshow": {
      "slide_type": "skip"
@@ -83,24 +83,75 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2021-01-29T19:27:59.465869Z",
-     "start_time": "2021-01-29T19:27:59.369975Z"
+     "end_time": "2021-01-31T07:59:43.242510Z",
+     "start_time": "2021-01-31T07:59:43.169963Z"
     }
    },
    "outputs": [
     {
-     "ename": "AttributeError",
-     "evalue": "module 'kwgen' has no attribute 'utils'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-3-65bd2af6110a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_corpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwgen\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'data/Tweets.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mdf_corpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mAttributeError\u001b[0m: module 'kwgen' has no attribute 'utils'"
-     ]
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>@VirginAmerica SFO-PDX schedule is still MIA.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>@VirginAmerica So excited for my first cross c...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>@VirginAmerica  I flew from NYC to SFO last we...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>I ❤️ flying @VirginAmerica. ☺️👍</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>@VirginAmerica you know what would be amazingl...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                 text\n",
+       "15      @VirginAmerica SFO-PDX schedule is still MIA.\n",
+       "16  @VirginAmerica So excited for my first cross c...\n",
+       "17  @VirginAmerica  I flew from NYC to SFO last we...\n",
+       "18                    I ❤️ flying @VirginAmerica. ☺️👍\n",
+       "19  @VirginAmerica you know what would be amazingl..."
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -117,11 +168,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2021-01-30T16:02:53.717597Z",
-     "start_time": "2021-01-30T16:02:53.715299Z"
+     "end_time": "2021-01-31T07:59:46.560388Z",
+     "start_time": "2021-01-31T07:59:46.557648Z"
     }
    },
    "outputs": [],
@@ -131,14 +182,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "execution_count": 5,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:01:14.767287Z",
+     "start_time": "2021-01-31T07:59:55.997765Z"
+    }
+   },
    "outputs": [],
    "source": [
     "# The [0] gives us the corpus\n",
     "# [1] is clean strings for BERT\n",
     "# [2] the indexes of selected entries if sample_size != 1\n",
-    "text_corpus = prepare_text_data(\n",
+    "text_corpus = prepare_data(\n",
     "    data=df_airline_tweets,\n",
     "    target_cols='text',\n",
     "    input_language=input_language, \n",
@@ -150,9 +206,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 6,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:01:16.802475Z",
+     "start_time": "2021-01-31T08:01:16.798131Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['virginamerica', 'schedule'],\n",
+       " ['virgin_america',\n",
+       "  'cross_country',\n",
+       "  'virginamerica',\n",
+       "  'excited',\n",
+       "  'cross',\n",
+       "  'country',\n",
+       "  'flight',\n",
+       "  'hear',\n",
+       "  'virgin',\n",
+       "  'america'],\n",
+       " ['virginamerica', 'week', 'seat', 'gentleman'],\n",
+       " ['virginamerica'],\n",
+       " ['virginamerica', 'amazingly', 'awesome']]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "text_corpus[15:20]"
    ]
@@ -169,8 +254,23 @@
    "execution_count": 11,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-11-09T19:02:40.001462Z",
-     "start_time": "2020-11-09T19:02:31.106061Z"
+     "end_time": "2021-01-31T08:25:37.421746Z",
+     "start_time": "2021-01-31T08:25:37.419367Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "num_keywords = 15\n",
+    "num_topics = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:03:30.958104Z",
+     "start_time": "2021-01-31T08:01:36.750490Z"
     },
     "slideshow": {
      "slide_type": "slide"
@@ -185,8 +285,8 @@
     "    clean_texts=None,\n",
     "    input_language=input_language,\n",
     "    output_language=None,\n",
-    "    num_keywords=15,\n",
-    "    num_topics=10,\n",
+    "    num_keywords=num_keywords,\n",
+    "    num_topics=num_topics,\n",
     "    corpuses_to_compare=None,\n",
     "    return_topics=True,\n",
     "    ignore_words=None,\n",
@@ -198,9 +298,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 8,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:03:58.445759Z",
+     "start_time": "2021-01-31T08:03:58.442117Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['hour',\n",
+       " 'usairway',\n",
+       " 'hold',\n",
+       " 'southwestair',\n",
+       " 'flight',\n",
+       " 'wait',\n",
+       " 'united',\n",
+       " 'americanair',\n",
+       " 'minute',\n",
+       " 'time',\n",
+       " 'luggage',\n",
+       " 'phone',\n",
+       " 'follow',\n",
+       " 'answer',\n",
+       " 'baggage']"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "topics[0]"
    ]
@@ -214,11 +344,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 9,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-11-09T20:23:19.788818Z",
-     "start_time": "2020-11-09T20:23:19.786309Z"
+     "end_time": "2021-01-31T08:04:11.372380Z",
+     "start_time": "2021-01-31T08:04:11.369974Z"
     },
     "slideshow": {
      "slide_type": "slide"
@@ -227,7 +357,7 @@
    "outputs": [],
    "source": [
     "# The following is a string or list of strings to not include in outputs\n",
-    "# This variable is updated in the keyword selection step of gen_files\n",
+    "# This variable is updated by the user if prompt_remove_words=True\n",
     "ignore_words = None"
    ]
   },
@@ -252,8 +382,8 @@
     "    clean_texts=None,\n",
     "    input_language=input_language,\n",
     "    output_language=None,\n",
-    "    num_keywords=15,\n",
-    "    num_topics=10,\n",
+    "    num_keywords=num_keywords,\n",
+    "    num_topics=num_topics,\n",
     "    corpuses_to_compare=None,\n",
     "    return_topics=False,\n",
     "    ignore_words=None,\n",
@@ -275,17 +405,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 10,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-11-09T19:02:53.171585Z",
-     "start_time": "2020-11-09T19:02:44.447035Z"
+     "end_time": "2021-01-31T08:19:14.945990Z",
+     "start_time": "2021-01-31T08:04:25.526000Z"
     },
     "slideshow": {
      "slide_type": "slide"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The LDA keywords are:\n",
+      "\n",
+      "['jetblue', 'united', 'flight', 'service', 'airline', 'southwestair', 'time', 'baggage', 'crew', 'attendant', 'cancel', 'americanair', 'hour', 'usairway', 'delay']\n",
+      "\n",
+      "Are there words that should be removed [y/n]? y\n",
+      "Type or copy word(s) to be removed: jetblue, united, americanair, usairway\n",
+      "\n",
+      "\n",
+      "The new LDA keywords are:\n",
+      "\n",
+      "['book', 'flight', 'southwestair', 'seat', 'service', 'plane', 'hour', 'hold', 'cancel', 'cancelled_flighted', 'delay', 'late', 'lose', 'baggage', 'airline']\n",
+      "\n",
+      "Are there words that should be removed [y/n]? y\n",
+      "Type or copy word(s) to be removed: southwestair\n",
+      "\n",
+      "\n",
+      "The new LDA keywords are:\n",
+      "\n",
+      "['time', 'flight', 'plane', 'fleek', 'ticket', 'cancel', 'usairways', 'baggage', 'love', 'virginamerica', 'service', 'customer', 'delay', 'late', 'hour']\n",
+      "\n",
+      "Are there words that should be removed [y/n]? y\n",
+      "Type or copy word(s) to be removed: virginamerica, fleek, usairways\n",
+      "\n",
+      "\n",
+      "The new LDA keywords are:\n",
+      "\n",
+      "['late', 'fleet', 'service', 'flight', 'time', 'love', 'book', 'customer', 'response', 'hold', 'hour', 'cancel', 'cancelled_flighted', 'delay', 'plane']\n",
+      "\n",
+      "Are there words that should be removed [y/n]? n\n"
+     ]
+    }
+   ],
    "source": [
     "lda_kws = extract_kws(\n",
     "    method='LDA',\n",
@@ -293,8 +459,8 @@
     "    clean_texts=None,\n",
     "    input_language=input_language,\n",
     "    output_language=None,\n",
-    "    num_keywords=15,\n",
-    "    num_topics=10,\n",
+    "    num_keywords=num_keywords,\n",
+    "    num_topics=num_topics,\n",
     "    corpuses_to_compare=None,\n",
     "    return_topics=False,\n",
     "    ignore_words=None,\n",
@@ -377,7 +543,7 @@
     "# The [0] gives us the corpus\n",
     "# [1] is clean strings for BERT\n",
     "# [2] the indexes of selected entries if sample_size != 1\n",
-    "united_corpus = prepare_text_data(\n",
+    "united_corpus = prepare_data(\n",
     "    data=df_united,\n",
     "    target_cols='text',\n",
     "    input_language=input_language, \n",
@@ -407,7 +573,7 @@
     "# The [0] gives us the corpus\n",
     "# [1] is clean strings for BERT\n",
     "# [2] the indexes of selected entries if sample_size != 1\n",
-    "other_airlines_corpus = prepare_text_data(\n",
+    "other_airlines_corpus = prepare_data(\n",
     "    data=df_other_airlines,\n",
     "    target_cols='text',\n",
     "    input_language=input_language, \n",
@@ -476,7 +642,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "topic_nums_to_compare = list(range(5,15))"
+    "topic_nums_to_compare = list(range(5, 16))"
    ]
   },
   {
@@ -490,7 +656,7 @@
     "    text_corpus=text_corpus, \n",
     "    clean_texts=None,\n",
     "    input_language=input_language,\n",
-    "    num_keywords=10,\n",
+    "    num_keywords=num_keywords,\n",
     "    topic_nums_to_compare=topic_nums_to_compare,\n",
     "    min_freq=2,\n",
     "    min_word_len=4,\n",
@@ -528,7 +694,7 @@
     "#     method='lda',\n",
     "#     text_corpus=text_corpus, \n",
     "#     input_language=input_language,\n",
-    "#     num_topics=10,\n",
+    "#     num_topics=num_topics,\n",
     "#     min_freq=2,\n",
     "#     min_word_len=4,\n",
     "#     save_file=False,\n",
@@ -586,7 +752,7 @@
     "t_sne(\n",
     "    dimension=\"both\", \n",
     "    text_corpus=text_corpus, \n",
-    "    num_topics=10, \n",
+    "    num_topics=num_topics, \n",
     "    remove_3d_outliers=True,\n",
     "    fig_size=(20, 10),\n",
     "    save_file=False,\n",
@@ -633,7 +799,7 @@
     "    clean_texts=None,\n",
     "    input_language=input_language,\n",
     "    output_language=None,\n",
-    "    num_keywords=15,\n",
+    "    num_keywords=num_keywords,\n",
     "    topic_nums_to_compare=topic_nums_to_compare,\n",
     "    corpuses_to_compare=None,\n",
     "    ignore_words=None,\n",

	text
15	@VirginAmerica SFO-PDX schedule is still MIA.
16	@VirginAmerica So excited for my first cross c...
17	@VirginAmerica I flew from NYC to SFO last we...
18	I ❤️ flying @VirginAmerica. ☺️👍
19	@VirginAmerica you know what would be amazingl...