From d763e82b8489c12266b4debb2ee419f4e463881b Mon Sep 17 00:00:00 2001
From: Andrew McAllister <andrew.t.mcallister@gmail.com>
Date: Sun, 31 Jan 2021 14:22:02 +0100
Subject: [PATCH] Add example notebook

---
 examples/kw_extraction.ipynb | 1004 ++++++++++++++++++++++++++++++++++
 1 file changed, 1004 insertions(+)
 create mode 100644 examples/kw_extraction.ipynb
diff --git a/examples/kw_extraction.ipynb b/examples/kw_extraction.ipynb
new file mode 100644
index 0000000..469a46c
--- /dev/null
+++ b/examples/kw_extraction.ipynb
@@ -0,0 +1,1004 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    },
+    "toc": true
+   },
+   "source": [
+    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
+    "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Load-Data\" data-toc-modified-id=\"Load-Data-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Load Data</a></span><ul class=\"toc-item\"><li><span><a href=\"#Prepare-Text-Data\" data-toc-modified-id=\"Prepare-Text-Data-1.1\"><span class=\"toc-item-num\">1.1&nbsp;&nbsp;</span>Prepare Text Data</a></span></li><li><span><a href=\"#Show-Model-Topics\" data-toc-modified-id=\"Show-Model-Topics-1.2\"><span class=\"toc-item-num\">1.2&nbsp;&nbsp;</span>Show Model Topics</a></span></li><li><span><a href=\"#Extract-Keywords\" data-toc-modified-id=\"Extract-Keywords-1.3\"><span class=\"toc-item-num\">1.3&nbsp;&nbsp;</span>Extract Keywords</a></span></li><li><span><a href=\"#Translate-Output\" data-toc-modified-id=\"Translate-Output-1.4\"><span class=\"toc-item-num\">1.4&nbsp;&nbsp;</span>Translate Output</a></span></li><li><span><a href=\"#Organize-by-Part-of-Speech\" data-toc-modified-id=\"Organize-by-Part-of-Speech-1.5\"><span class=\"toc-item-num\">1.5&nbsp;&nbsp;</span>Organize by Part of Speech</a></span></li><li><span><a href=\"#Get-TFIDF-Keywords\" data-toc-modified-id=\"Get-TFIDF-Keywords-1.6\"><span class=\"toc-item-num\">1.6&nbsp;&nbsp;</span>Get TFIDF Keywords</a></span></li></ul></li><li><span><a href=\"#Visualization-Functions\" data-toc-modified-id=\"Visualization-Functions-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Visualization Functions</a></span><ul class=\"toc-item\"><li><span><a href=\"#Graph-of-Topic-Number-Evaluations\" data-toc-modified-id=\"Graph-of-Topic-Number-Evaluations-2.1\"><span class=\"toc-item-num\">2.1&nbsp;&nbsp;</span>Graph of Topic Number Evaluations</a></span></li><li><span><a href=\"#pyLDAvis-Topic-Visualization\" data-toc-modified-id=\"pyLDAvis-Topic-Visualization-2.2\"><span class=\"toc-item-num\">2.2&nbsp;&nbsp;</span>pyLDAvis Topic Visualization</a></span></li><li><span><a href=\"#Word-Cloud\" data-toc-modified-id=\"Word-Cloud-2.3\"><span class=\"toc-item-num\">2.3&nbsp;&nbsp;</span>Word Cloud</a></span></li><li><span><a href=\"#t-SNE\" data-toc-modified-id=\"t-SNE-2.4\"><span class=\"toc-item-num\">2.4&nbsp;&nbsp;</span>t-SNE</a></span></li></ul></li><li><span><a href=\"#gen_files\" data-toc-modified-id=\"gen_files-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>gen_files</a></span></li></ul></div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**kw_extraction**\n",
+    "\n",
+    "This notebook describes the use of [kwx](https://github.com/andrewtavis/kwx) by deriving the top keywords for tweets from the [Twitter US Airline Sentiment](https://www.kaggle.com/crowdflower/twitter-airline-sentiment) dataset. \n",
+    "\n",
+    "To run it, download the dataset from the link above, rename the file to the more descriptive `airline_tweets.csv`, and place it in a `data` directory inside the current working directory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:59:43.264123Z",
+     "start_time": "2021-01-31T08:59:43.256192Z"
+    },
+    "slideshow": {
+     "slide_type": "skip"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>.container { width:99% !important; }</style>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from kwx.utils import load_data, prepare_data\n",
+    "from kwx.utils import organize_by_pos, translate_output\n",
+    "from kwx.model import extract_kws, gen_files\n",
+    "from kwx.visuals import graph_topic_num_evals, pyLDAvis_topics\n",
+    "from kwx.visuals import gen_word_cloud, t_sne\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "# Plot settings\n",
+    "sns.set(style=\"darkgrid\")\n",
+    "sns.set(rc={'figure.figsize':(15,5)})\n",
+    "\n",
+    "pd.set_option(\"display.max_rows\", 16) # maximum df rows\n",
+    "pd.set_option('display.max_columns', None) # maximum df columns\n",
+    "from IPython.display import display, HTML  # public API; the IPython.core.display path is deprecated\n",
+    "display(HTML(\"<style>.container { width:99% !important; }</style>\")) # widens interface\n",
+    "# %matplotlib notebook"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:37:43.136154Z",
+     "start_time": "2021-01-31T08:37:43.075058Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>@VirginAmerica SFO-PDX schedule is still MIA.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>@VirginAmerica So excited for my first cross c...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>@VirginAmerica  I flew from NYC to SFO last we...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>I ❤️ flying @VirginAmerica. ☺️👍</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>@VirginAmerica you know what would be amazingl...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                 text\n",
+       "15      @VirginAmerica SFO-PDX schedule is still MIA.\n",
+       "16  @VirginAmerica So excited for my first cross c...\n",
+       "17  @VirginAmerica  I flew from NYC to SFO last we...\n",
+       "18                    I ❤️ flying @VirginAmerica. ☺️👍\n",
+       "19  @VirginAmerica you know what would be amazingl..."
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_airline_tweets = load_data(data='data/airline_tweets.csv', target_cols='text')\n",
+    "df_airline_tweets[15:20]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prepare Text Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:37:45.997763Z",
+     "start_time": "2021-01-31T08:37:45.995221Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "input_language, output_language = 'english', 'english'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:39:02.598222Z",
+     "start_time": "2021-01-31T08:37:46.622361Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# The [0] gives us the corpus\n",
+    "# [1] is clean strings for BERT\n",
+    "# [2] the indexes of selected entries if sample_size != 1\n",
+    "text_corpus = prepare_data(\n",
+    "    data=df_airline_tweets,\n",
+    "    target_cols='text',\n",
+    "    input_language=input_language, \n",
+    "    min_freq=2,\n",
+    "    min_word_len=4,\n",
+    "    sample_size=1,\n",
+    ")[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:39:02.603650Z",
+     "start_time": "2021-01-31T08:39:02.599983Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['virginamerica', 'schedule'],\n",
+       " ['virgin_america',\n",
+       "  'cross_country',\n",
+       "  'virginamerica',\n",
+       "  'excited',\n",
+       "  'cross',\n",
+       "  'country',\n",
+       "  'flight',\n",
+       "  'hear',\n",
+       "  'virgin',\n",
+       "  'america'],\n",
+       " ['virginamerica', 'week', 'seat', 'gentleman'],\n",
+       " ['virginamerica'],\n",
+       " ['virginamerica', 'amazingly', 'awesome']]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text_corpus[15:20]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Show Model Topics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:39:02.608421Z",
+     "start_time": "2021-01-31T08:39:02.606477Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "num_keywords = 15\n",
+    "num_topics = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:40:56.913180Z",
+     "start_time": "2021-01-31T08:39:02.610456Z"
+    },
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# return_topics=True gives us the topics themselves\n",
+    "topics = extract_kws(\n",
+    "    method='LDA',\n",
+    "    text_corpus=text_corpus,\n",
+    "    clean_texts=None,\n",
+    "    input_language=input_language,\n",
+    "    output_language=None,\n",
+    "    num_keywords=num_keywords,\n",
+    "    num_topics=num_topics,\n",
+    "    corpuses_to_compare=None,\n",
+    "    return_topics=True,\n",
+    "    ignore_words=None,\n",
+    "    min_freq=2,\n",
+    "    min_word_len=4,\n",
+    "    sample_size=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:40:56.918174Z",
+     "start_time": "2021-01-31T08:40:56.914916Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['flight',\n",
+       " 'cancel',\n",
+       " 'hold',\n",
+       " 'americanair',\n",
+       " 'southwestair',\n",
+       " 'hour',\n",
+       " 'usairway',\n",
+       " 'flightled',\n",
+       " 'cancelled_flightle',\n",
+       " 'wait',\n",
+       " 'minute',\n",
+       " 'cancelled_flighted',\n",
+       " 'time',\n",
+       " 'phone',\n",
+       " 'usairways']"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "topics[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Extract Keywords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:40:56.922185Z",
+     "start_time": "2021-01-31T08:40:56.919967Z"
+    },
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# The following is a string or list of strings to not include in outputs\n",
+    "# This variable is updated by the user if prompt_remove_words=True\n",
+    "ignore_words = None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:40:58.818627Z",
+     "start_time": "2021-01-31T08:40:56.923820Z"
+    },
+    "code_folding": [],
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "freq_kws = extract_kws(\n",
+    "    method='frequency',\n",
+    "    text_corpus=text_corpus,\n",
+    "    clean_texts=None,\n",
+    "    input_language=input_language,\n",
+    "    output_language=None,\n",
+    "    num_keywords=num_keywords,\n",
+    "    num_topics=num_topics,\n",
+    "    corpuses_to_compare=None,\n",
+    "    return_topics=False,\n",
+    "    ignore_words=None,\n",
+    "    min_freq=2,\n",
+    "    min_word_len=4,\n",
+    "    sample_size=1,\n",
+    "    prompt_remove_words=False\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:40:58.826772Z",
+     "start_time": "2021-01-31T08:40:58.821454Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['flight',\n",
+       " 'united',\n",
+       " 'americanair',\n",
+       " 'southwestair',\n",
+       " 'jetblue',\n",
+       " 'usairway',\n",
+       " 'hour',\n",
+       " 'cancel',\n",
+       " 'service',\n",
+       " 'delay',\n",
+       " 'customer',\n",
+       " 'time',\n",
+       " 'usairways',\n",
+       " 'plane',\n",
+       " 'hold']"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "freq_kws"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:56:16.501019Z",
+     "start_time": "2021-01-31T08:40:58.829946Z"
+    },
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The LDA keywords are:\n",
+      "\n",
+      "['united', 'customer', 'jetblue', 'flight', 'book', 'americanair', 'travel', 'change', 'southwestair', 'love', 'usairway', 'hour', 'delay', 'plane', 'gate']\n",
+      "\n",
+      "Are there words that should be removed [y/n]? y\n",
+      "Type or copy word(s) to be removed: united, jetblue, americanair, southwestair, usairway\n",
+      "\n",
+      "\n",
+      "The new LDA keywords are:\n",
+      "\n",
+      "['email', 'check', 'flight', 'cancel', 'agent', 'late', 'virginamerica', 'hold', 'phone', 'service', 'customer', 'wait', 'plane', 'delay', 'gate']\n",
+      "\n",
+      "Are there words that should be removed [y/n]? y\n",
+      "Type or copy word(s) to be removed: virginamerica\n",
+      "\n",
+      "\n",
+      "The new LDA keywords are:\n",
+      "\n",
+      "['luggage', 'customer', 'delay', 'flight', 'late', 'minute', 'change', 'phone', 'service', 'plane', 'lose', 'love', 'fleek', 'hour', 'hold']\n",
+      "\n",
+      "Are there words that should be removed [y/n]? y\n",
+      "Type or copy word(s) to be removed: fleek\n",
+      "\n",
+      "\n",
+      "The new LDA keywords are:\n",
+      "\n",
+      "['love', 'flight', 'usairways', 'hour', 'service', 'cancel', 'lose', 'luggage', 'phone', 'time', 'plane', 'night', 'delay', 'minute', 'wait']\n",
+      "\n",
+      "Are there words that should be removed [y/n]? n\n"
+     ]
+    }
+   ],
+   "source": [
+    "lda_kws = extract_kws(\n",
+    "    method='LDA',\n",
+    "    text_corpus=text_corpus,\n",
+    "    clean_texts=None,\n",
+    "    input_language=input_language,\n",
+    "    output_language=None,\n",
+    "    num_keywords=num_keywords,\n",
+    "    num_topics=num_topics,\n",
+    "    corpuses_to_compare=None,\n",
+    "    return_topics=False,\n",
+    "    ignore_words=None,\n",
+    "    min_freq=2,\n",
+    "    min_word_len=4,\n",
+    "    sample_size=1,\n",
+    "    prompt_remove_words=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:56:18.468146Z",
+     "start_time": "2021-01-31T08:56:18.463272Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['love',\n",
+       " 'flight',\n",
+       " 'usairways',\n",
+       " 'hour',\n",
+       " 'service',\n",
+       " 'cancel',\n",
+       " 'lose',\n",
+       " 'luggage',\n",
+       " 'phone',\n",
+       " 'time',\n",
+       " 'plane',\n",
+       " 'night',\n",
+       " 'delay',\n",
+       " 'minute',\n",
+       " 'wait']"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "lda_kws"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Translate Output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# translate_output(\n",
+    "#     outputs=lda_kws, \n",
+    "#     input_language=input_language, \n",
+    "#     output_language='spanish'\n",
+    "# )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Organize by Part of Speech"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:58:46.173285Z",
+     "start_time": "2021-01-31T08:58:45.712104Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'Nouns:': [love,\n",
+       "  flight,\n",
+       "  usairways,\n",
+       "  hour,\n",
+       "  service,\n",
+       "  luggage,\n",
+       "  phone,\n",
+       "  time,\n",
+       "  plane,\n",
+       "  night,\n",
+       "  minute,\n",
+       "  delay],\n",
+       " 'Verbs:': [cancel, lose, wait]}"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "organize_by_pos(outputs=lda_kws, output_language=output_language)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get TFIDF Keywords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:58:57.939951Z",
+     "start_time": "2021-01-31T08:58:57.927841Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "df_united = df_airline_tweets[\n",
+    "    df_airline_tweets['text'].str.contains(\"united\")\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:59:19.317576Z",
+     "start_time": "2021-01-31T08:58:58.969188Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# The [0] gives us the corpus\n",
+    "# [1] is clean strings for BERT\n",
+    "# [2] the indexes of selected entries if sample_size != 1\n",
+    "united_corpus = prepare_data(\n",
+    "    data=df_united,\n",
+    "    target_cols='text',\n",
+    "    input_language=input_language, \n",
+    "    min_freq=2,\n",
+    "    min_word_len=4,\n",
+    "    sample_size=1,\n",
+    ")[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T08:59:49.557969Z",
+     "start_time": "2021-01-31T08:59:49.553165Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "df_other_airlines = df_airline_tweets.loc[\n",
+    "    np.setdiff1d(df_airline_tweets.index, df_united.index)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T09:00:55.419996Z",
+     "start_time": "2021-01-31T08:59:55.210493Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# The [0] gives us the corpus\n",
+    "# [1] is clean strings for BERT\n",
+    "# [2] the indexes of selected entries if sample_size != 1\n",
+    "other_airlines_corpus = prepare_data(\n",
+    "    data=df_other_airlines,\n",
+    "    target_cols='text',\n",
+    "    input_language=input_language, \n",
+    "    min_freq=2,\n",
+    "    min_word_len=4,\n",
+    "    sample_size=1,\n",
+    ")[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T09:01:24.129883Z",
+     "start_time": "2021-01-31T09:00:55.421683Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The FREQUENCY keywords are:\n",
+      "\n",
+      "['united', 'flight', 'delay', 'service', 'customer', 'hour', 'time', 'plane', 'cancel', 'wait']\n",
+      "\n",
+      "Are there words that should be removed [y/n]? n\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Words that are prevalent in United tweets compared to others\n",
+    "tfidf_kws = extract_kws(\n",
+    "   method='tfidf',\n",
+    "   text_corpus=united_corpus,\n",
+    "   clean_texts=None,\n",
+    "   input_language=input_language,\n",
+    "   output_language=None,\n",
+    "   num_keywords=10,\n",
+    "   num_topics=10,\n",
+    "   corpuses_to_compare=other_airlines_corpus,\n",
+    "   return_topics=False,\n",
+    "   ignore_words=ignore_words,\n",
+    "   min_freq=2,\n",
+    "   min_word_len=4,\n",
+    "   sample_size=1,\n",
+    "   prompt_remove_words=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-01-31T09:01:24.136342Z",
+     "start_time": "2021-01-31T09:01:24.132593Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['united',\n",
+       " 'flight',\n",
+       " 'delay',\n",
+       " 'service',\n",
+       " 'customer',\n",
+       " 'hour',\n",
+       " 'time',\n",
+       " 'plane',\n",
+       " 'cancel',\n",
+       " 'wait']"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tfidf_kws"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Visualization Functions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Graph of Topic Number Evaluations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "topic_nums_to_compare = list(range(5, 16))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Commented out to avoid long run times\n",
+    "# figure = graph_topic_num_evals(\n",
+    "#     method=['lda', 'bert', 'lda_bert'],\n",
+    "#     text_corpus=text_corpus, \n",
+    "#     input_language=input_language,\n",
+    "#     num_keywords=num_keywords,\n",
+    "#     topic_nums_to_compare=topic_nums_to_compare,\n",
+    "#     sample_size=1,\n",
+    "#     metrics=True,  # stability and coherence\n",
+    "#     save_file=False, # True for pwd or directory name\n",
+    "#     return_ideal_metrics=False,  # plot the results instead of returning the ideal model\n",
+    "#     verbose=False,  # so progress bar isn't broken online\n",
+    "# )\n",
+    "# plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## pyLDAvis Topic Visualization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-11-09T19:15:17.269799Z",
+     "start_time": "2020-11-09T19:15:17.267481Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Commented out as it changes the output dimensions due to its width\n",
+    "# pyLDAvis_topics(\n",
+    "#     method='lda',\n",
+    "#     text_corpus=text_corpus, \n",
+    "#     input_language=input_language,\n",
+    "#     num_topics=num_topics,\n",
+    "#     save_file=False, # True for pwd or directory name\n",
+    "#     display_ipython=True,  # <- show in Jupyter notebook\n",
+    "# )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Word Cloud"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ignore_words = [\n",
+    "    \"jetblue\",\n",
+    "    \"united\",\n",
+    "    \"americanair\",\n",
+    "    \"usairway\",\n",
+    "    \"southwestair\",\n",
+    "    \"virginamerica\",\n",
+    "    \"fleek\",\n",
+    "    \"usairways\",\n",
+    "    \"flightled\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gen_word_cloud(\n",
+    "    text_corpus=text_corpus,\n",
+    "    input_language=input_language,\n",
+    "    ignore_words=ignore_words,\n",
+    "    height=500,\n",
+    "    save_file=False, # True for pwd or directory name\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## t-SNE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t_sne(\n",
+    "    dimension=\"both\", \n",
+    "    text_corpus=text_corpus, \n",
+    "    num_topics=num_topics, \n",
+    "    remove_3d_outliers=True,\n",
+    "    fig_size=(20, 10),\n",
+    "    save_file=False, # True for pwd or directory name\n",
+    ")\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# gen_files"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[kwx.model.gen_files](https://github.com/andrewtavis/kwx/blob/main/kwx/model.py) does the following:\n",
+    "\n",
+    "- Computes the optimal number of topics for the given model type(s)\n",
+    "\n",
+    "- Extracts the most frequent keywords and those for the optimal topic model\n",
+    "\n",
+    "- Allows the user to refine keywords given their intuitions\n",
+    "\n",
+    "- Plots the desired visuals\n",
+    "\n",
+    "- Puts all of the above in a directory or zipped file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Commented out to avoid long run times\n",
+    "# gen_files(\n",
+    "#     method=['lda', 'bert', 'lda_bert'],\n",
+    "#     text_corpus=text_corpus, \n",
+    "#     input_language=input_language,\n",
+    "#     output_language=None,\n",
+    "#     num_keywords=num_keywords,\n",
+    "#     topic_nums_to_compare=topic_nums_to_compare,\n",
+    "#     ignore_words=ignore_words,\n",
+    "#     min_freq=2,\n",
+    "#     min_word_len=4,\n",
+    "#     sample_size=1,\n",
+    "#     prompt_remove_words=True,\n",
+    "#     verbose=False,  # so progress bar isn't broken online\n",
+    "#     org_by_pos=False,  # organize keywords by part of speech\n",
+    "#     incl_visuals=['topic_num_evals', 'word_cloud', 'pyLDAvis'],  # t_sne not zipping properly\n",
+    "#     zip_results=True,\n",
+    "# )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": true,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

	text
15	@VirginAmerica SFO-PDX schedule is still MIA.
16	@VirginAmerica So excited for my first cross c...
17	@VirginAmerica I flew from NYC to SFO last we...
18	I ❤️ flying @VirginAmerica. ☺️👍
19	@VirginAmerica you know what would be amazingl...