diff --git a/examples/kw_extraction.ipynb b/examples/kw_extraction.ipynb
deleted file mode 100644
index 469a46c..0000000
--- a/examples/kw_extraction.ipynb
+++ /dev/null
@@ -1,1004 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "slideshow": {
- "slide_type": "slide"
- },
- "toc": true
- },
- "source": [
- "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**kw_extraction**\n",
- "\n",
- "This notebook describes the use of [kwx](https://github.com/andrewtavis/kwx) by deriving the top keywords for tweets from the [Twitter US Airline Sentiment](https://www.kaggle.com/crowdflower/twitter-airline-sentiment) dataset. \n",
- "\n",
- "Follow the provided link to download the data, rename the file to `airline_tweets.csv` to be more descriptive, and then place it in a `data` directory inside the current working directory."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:59:43.264123Z",
- "start_time": "2021-01-31T08:59:43.256192Z"
- },
- "slideshow": {
- "slide_type": "skip"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- ""
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "import os\n",
- "import sys\n",
- "\n",
- "import matplotlib.pyplot as plt\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "import seaborn as sns\n",
- "# Import from the public IPython.display module; importing display from\n",
- "# IPython.core.display is deprecated.\n",
- "from IPython.display import HTML, display\n",
- "\n",
- "from kwx.utils import load_data, prepare_data\n",
- "from kwx.utils import organize_by_pos, translate_output\n",
- "from kwx.model import extract_kws, gen_files\n",
- "from kwx.visuals import graph_topic_num_evals, pyLDAvis_topics\n",
- "from kwx.visuals import gen_word_cloud, t_sne\n",
- "\n",
- "# Plot settings: darkgrid theme with a wide default figure size, set in a\n",
- "# single call so the second setting cannot silently reset the first.\n",
- "sns.set(style=\"darkgrid\", rc={\"figure.figsize\": (15, 5)})\n",
- "\n",
- "pd.set_option(\"display.max_rows\", 16) # maximum df rows\n",
- "pd.set_option(\"display.max_columns\", None) # maximum df columns\n",
- "\n",
- "# Widens the notebook interface. An empty HTML payload is a no-op, so the\n",
- "# CSS rule is spelled out here.\n",
- "# NOTE(review): the CSS was reconstructed from the 'widens interface'\n",
- "# comment - confirm the intended width value.\n",
- "display(HTML(\"<style>.container { width:99% !important; }</style>\"))\n",
- "# %matplotlib notebook"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Load Data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:37:43.136154Z",
- "start_time": "2021-01-31T08:37:43.075058Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- "  <thead>\n",
- "    <tr style=\"text-align: right;\">\n",
- "      <th></th>\n",
- "      <th>text</th>\n",
- "    </tr>\n",
- "  </thead>\n",
- "  <tbody>\n",
- "    <tr>\n",
- "      <th>15</th>\n",
- "      <td>@VirginAmerica SFO-PDX schedule is still MIA.</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>16</th>\n",
- "      <td>@VirginAmerica So excited for my first cross c...</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>17</th>\n",
- "      <td>@VirginAmerica I flew from NYC to SFO last we...</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>18</th>\n",
- "      <td>I ❤️ flying @VirginAmerica. ☺️👍</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>19</th>\n",
- "      <td>@VirginAmerica you know what would be amazingl...</td>\n",
- "    </tr>\n",
- "  </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " text\n",
- "15 @VirginAmerica SFO-PDX schedule is still MIA.\n",
- "16 @VirginAmerica So excited for my first cross c...\n",
- "17 @VirginAmerica I flew from NYC to SFO last we...\n",
- "18 I ❤️ flying @VirginAmerica. ☺️👍\n",
- "19 @VirginAmerica you know what would be amazingl..."
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_airline_tweets = load_data(data='data/airline_tweets.csv', target_cols='text')\n",
- "df_airline_tweets[15:20]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Prepare Text Data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:37:45.997763Z",
- "start_time": "2021-01-31T08:37:45.995221Z"
- }
- },
- "outputs": [],
- "source": [
- "input_language, output_language = 'english', 'english'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:39:02.598222Z",
- "start_time": "2021-01-31T08:37:46.622361Z"
- }
- },
- "outputs": [],
- "source": [
- "# The [0] gives us the corpus\n",
- "# [1] is clean strings for BERT\n",
- "# [2] the indexes of selected entries if sample_size != 1\n",
- "text_corpus = prepare_data(\n",
- " data=df_airline_tweets,\n",
- " target_cols='text',\n",
- " input_language=input_language, \n",
- " min_freq=2,\n",
- " min_word_len=4,\n",
- " sample_size=1,\n",
- ")[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:39:02.603650Z",
- "start_time": "2021-01-31T08:39:02.599983Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[['virginamerica', 'schedule'],\n",
- " ['virgin_america',\n",
- " 'cross_country',\n",
- " 'virginamerica',\n",
- " 'excited',\n",
- " 'cross',\n",
- " 'country',\n",
- " 'flight',\n",
- " 'hear',\n",
- " 'virgin',\n",
- " 'america'],\n",
- " ['virginamerica', 'week', 'seat', 'gentleman'],\n",
- " ['virginamerica'],\n",
- " ['virginamerica', 'amazingly', 'awesome']]"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "text_corpus[15:20]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Show Model Topics"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:39:02.608421Z",
- "start_time": "2021-01-31T08:39:02.606477Z"
- }
- },
- "outputs": [],
- "source": [
- "num_keywords = 15\n",
- "num_topics = 10"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:40:56.913180Z",
- "start_time": "2021-01-31T08:39:02.610456Z"
- },
- "slideshow": {
- "slide_type": "slide"
- }
- },
- "outputs": [],
- "source": [
- "# return_topics=True gives us the topics themselves\n",
- "topics = extract_kws(\n",
- " method='LDA',\n",
- " text_corpus=text_corpus,\n",
- " clean_texts=None,\n",
- " input_language=input_language,\n",
- " output_language=None,\n",
- " num_keywords=num_keywords,\n",
- " num_topics=num_topics,\n",
- " corpuses_to_compare=None,\n",
- " return_topics=True,\n",
- " ignore_words=None,\n",
- " min_freq=2,\n",
- " min_word_len=4,\n",
- " sample_size=1\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:40:56.918174Z",
- "start_time": "2021-01-31T08:40:56.914916Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['flight',\n",
- " 'cancel',\n",
- " 'hold',\n",
- " 'americanair',\n",
- " 'southwestair',\n",
- " 'hour',\n",
- " 'usairway',\n",
- " 'flightled',\n",
- " 'cancelled_flightle',\n",
- " 'wait',\n",
- " 'minute',\n",
- " 'cancelled_flighted',\n",
- " 'time',\n",
- " 'phone',\n",
- " 'usairways']"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "topics[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Extract Keywords"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:40:56.922185Z",
- "start_time": "2021-01-31T08:40:56.919967Z"
- },
- "slideshow": {
- "slide_type": "slide"
- }
- },
- "outputs": [],
- "source": [
- "# The following is a string or list of strings to not include in outputs\n",
- "# This variable is updated by the user if prompt_remove_words=True\n",
- "ignore_words = None"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:40:58.818627Z",
- "start_time": "2021-01-31T08:40:56.923820Z"
- },
- "code_folding": [],
- "slideshow": {
- "slide_type": "slide"
- }
- },
- "outputs": [],
- "source": [
- "freq_kws = extract_kws(\n",
- " method='frequency',\n",
- " text_corpus=text_corpus,\n",
- " clean_texts=None,\n",
- " input_language=input_language,\n",
- " output_language=None,\n",
- " num_keywords=num_keywords,\n",
- " num_topics=num_topics,\n",
- " corpuses_to_compare=None,\n",
- " return_topics=False,\n",
- " ignore_words=None,\n",
- " min_freq=2,\n",
- " min_word_len=4,\n",
- " sample_size=1,\n",
- " prompt_remove_words=False\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:40:58.826772Z",
- "start_time": "2021-01-31T08:40:58.821454Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['flight',\n",
- " 'united',\n",
- " 'americanair',\n",
- " 'southwestair',\n",
- " 'jetblue',\n",
- " 'usairway',\n",
- " 'hour',\n",
- " 'cancel',\n",
- " 'service',\n",
- " 'delay',\n",
- " 'customer',\n",
- " 'time',\n",
- " 'usairways',\n",
- " 'plane',\n",
- " 'hold']"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "freq_kws"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:56:16.501019Z",
- "start_time": "2021-01-31T08:40:58.829946Z"
- },
- "slideshow": {
- "slide_type": "slide"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The LDA keywords are:\n",
- "\n",
- "['united', 'customer', 'jetblue', 'flight', 'book', 'americanair', 'travel', 'change', 'southwestair', 'love', 'usairway', 'hour', 'delay', 'plane', 'gate']\n",
- "\n",
- "Are there words that should be removed [y/n]? y\n",
- "Type or copy word(s) to be removed: united, jetblue, americanair, southwestair, usairway\n",
- "\n",
- "\n",
- "The new LDA keywords are:\n",
- "\n",
- "['email', 'check', 'flight', 'cancel', 'agent', 'late', 'virginamerica', 'hold', 'phone', 'service', 'customer', 'wait', 'plane', 'delay', 'gate']\n",
- "\n",
- "Are there words that should be removed [y/n]? y\n",
- "Type or copy word(s) to be removed: virginamerica\n",
- "\n",
- "\n",
- "The new LDA keywords are:\n",
- "\n",
- "['luggage', 'customer', 'delay', 'flight', 'late', 'minute', 'change', 'phone', 'service', 'plane', 'lose', 'love', 'fleek', 'hour', 'hold']\n",
- "\n",
- "Are there words that should be removed [y/n]? y\n",
- "Type or copy word(s) to be removed: fleek\n",
- "\n",
- "\n",
- "The new LDA keywords are:\n",
- "\n",
- "['love', 'flight', 'usairways', 'hour', 'service', 'cancel', 'lose', 'luggage', 'phone', 'time', 'plane', 'night', 'delay', 'minute', 'wait']\n",
- "\n",
- "Are there words that should be removed [y/n]? n\n"
- ]
- }
- ],
- "source": [
- "lda_kws = extract_kws(\n",
- " method='LDA',\n",
- " text_corpus=text_corpus,\n",
- " clean_texts=None,\n",
- " input_language=input_language,\n",
- " output_language=None,\n",
- " num_keywords=num_keywords,\n",
- " num_topics=num_topics,\n",
- " corpuses_to_compare=None,\n",
- " return_topics=False,\n",
- " ignore_words=None,\n",
- " min_freq=2,\n",
- " min_word_len=4,\n",
- " sample_size=1,\n",
- " prompt_remove_words=True\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:56:18.468146Z",
- "start_time": "2021-01-31T08:56:18.463272Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['love',\n",
- " 'flight',\n",
- " 'usairways',\n",
- " 'hour',\n",
- " 'service',\n",
- " 'cancel',\n",
- " 'lose',\n",
- " 'luggage',\n",
- " 'phone',\n",
- " 'time',\n",
- " 'plane',\n",
- " 'night',\n",
- " 'delay',\n",
- " 'minute',\n",
- " 'wait']"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "lda_kws"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Translate Output"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# translate_output(\n",
- "# outputs=lda_kws, \n",
- "# input_language=input_language, \n",
- "# output_language='spanish'\n",
- "# )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Organize by Part of Speech"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:58:46.173285Z",
- "start_time": "2021-01-31T08:58:45.712104Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'Nouns:': [love,\n",
- " flight,\n",
- " usairways,\n",
- " hour,\n",
- " service,\n",
- " luggage,\n",
- " phone,\n",
- " time,\n",
- " plane,\n",
- " night,\n",
- " minute,\n",
- " delay],\n",
- " 'Verbs:': [cancel, lose, wait]}"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "organize_by_pos(outputs=lda_kws, output_language=output_language)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Get TFIDF Keywords"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:58:57.939951Z",
- "start_time": "2021-01-31T08:58:57.927841Z"
- }
- },
- "outputs": [],
- "source": [
- "# Select the tweets that mention United. The match is case-insensitive so\n",
- "# that mentions such as \"@United\" are kept, and missing text counts as\n",
- "# no match so that the boolean mask never contains NaN.\n",
- "df_united = df_airline_tweets[\n",
- "    df_airline_tweets['text'].str.contains(\"united\", case=False, na=False)\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:59:19.317576Z",
- "start_time": "2021-01-31T08:58:58.969188Z"
- }
- },
- "outputs": [],
- "source": [
- "# The [0] gives us the corpus\n",
- "# [1] is clean strings for BERT\n",
- "# [2] the indexes of selected entries if sample_size != 1\n",
- "united_corpus = prepare_data(\n",
- " data=df_united,\n",
- " target_cols='text',\n",
- " input_language=input_language, \n",
- " min_freq=2,\n",
- " min_word_len=4,\n",
- " sample_size=1,\n",
- ")[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T08:59:49.557969Z",
- "start_time": "2021-01-31T08:59:49.553165Z"
- }
- },
- "outputs": [],
- "source": [
- "df_other_airlines = df_airline_tweets.loc[\n",
- " np.setdiff1d(df_airline_tweets.index, df_united.index)\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T09:00:55.419996Z",
- "start_time": "2021-01-31T08:59:55.210493Z"
- }
- },
- "outputs": [],
- "source": [
- "# The [0] gives us the corpus\n",
- "# [1] is clean strings for BERT\n",
- "# [2] the indexes of selected entries if sample_size != 1\n",
- "other_airlines_corpus = prepare_data(\n",
- " data=df_other_airlines,\n",
- " target_cols='text',\n",
- " input_language=input_language, \n",
- " min_freq=2,\n",
- " min_word_len=4,\n",
- " sample_size=1,\n",
- ")[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T09:01:24.129883Z",
- "start_time": "2021-01-31T09:00:55.421683Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The FREQUENCY keywords are:\n",
- "\n",
- "['united', 'flight', 'delay', 'service', 'customer', 'hour', 'time', 'plane', 'cancel', 'wait']\n",
- "\n",
- "Are there words that should be removed [y/n]? n\n"
- ]
- }
- ],
- "source": [
- "# Words that are prevalent in United tweets compared to others\n",
- "tfidf_kws = extract_kws(\n",
- " method='tfidf',\n",
- " text_corpus=united_corpus,\n",
- " clean_texts=None,\n",
- " input_language=input_language,\n",
- " output_language=None,\n",
- " num_keywords=10,\n",
- " num_topics=10,\n",
- " corpuses_to_compare=other_airlines_corpus,\n",
- " return_topics=False,\n",
- " ignore_words=ignore_words,\n",
- " min_freq=2,\n",
- " min_word_len=4,\n",
- " sample_size=1,\n",
- " prompt_remove_words=False,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2021-01-31T09:01:24.136342Z",
- "start_time": "2021-01-31T09:01:24.132593Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['united',\n",
- " 'flight',\n",
- " 'delay',\n",
- " 'service',\n",
- " 'customer',\n",
- " 'hour',\n",
- " 'time',\n",
- " 'plane',\n",
- " 'cancel',\n",
- " 'wait']"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tfidf_kws"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Visualization Functions"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Graph of Topic Number Evaluations"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "topic_nums_to_compare = list(range(5, 16))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Commented out to avoid long run times\n",
- "# figure = graph_topic_num_evals(\n",
- "# method=['lda', 'bert', 'lda_bert'],\n",
- "# text_corpus=text_corpus, \n",
- "# input_language=input_language,\n",
- "# num_keywords=num_keywords,\n",
- "# topic_nums_to_compare=topic_nums_to_compare,\n",
- "# sample_size=1,\n",
- "# metrics=True, # stability and coherence\n",
- "# save_file=False, # True for pwd or directory name\n",
- "# return_ideal_metrics=False, # don't output ideal model instead of plot\n",
- "# verbose=False, # so progress bar isn't broken online\n",
- "# )\n",
- "# plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## pyLDAvis Topic Visualization"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-11-09T19:15:17.269799Z",
- "start_time": "2020-11-09T19:15:17.267481Z"
- }
- },
- "outputs": [],
- "source": [
- "# Commented out as it changes the output dimensions due to its width\n",
- "# pyLDAvis_topics(\n",
- "# method='lda',\n",
- "# text_corpus=text_corpus, \n",
- "# input_language=input_language,\n",
- "# num_topics=num_topics,\n",
- "# save_file=False, # True for pwd or directory name\n",
- "# display_ipython=True, # <- show in Jupyter notebook\n",
- "# )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Word Cloud"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ignore_words = [\n",
- " \"jetblue\",\n",
- " \"united\",\n",
- " \"americanair\",\n",
- " \"usairway\",\n",
- " \"southwestair\",\n",
- " \"virginamerica\",\n",
- " \"fleek\",\n",
- " \"usairways\",\n",
- " \"flightled\",\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "gen_word_cloud(\n",
- " text_corpus=text_corpus,\n",
- " input_language=input_language,\n",
- " ignore_words=ignore_words,\n",
- " height=500,\n",
- " save_file=False, # True for pwd or directory name\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## t-SNE"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "t_sne(\n",
- " dimension=\"both\", \n",
- " text_corpus=text_corpus, \n",
- " num_topics=num_topics, \n",
- " remove_3d_outliers=True,\n",
- " fig_size=(20, 10),\n",
- " save_file=False, # True for pwd or directory name\n",
- ")\n",
- "\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# gen_files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "[kwx.model.gen_files](https://github.com/andrewtavis/kwx/blob/main/kwx/model.py) does the following:\n",
- "\n",
- "- Computes the optimal number of topics for the given model type(s)\n",
- "\n",
- "- Extracts the most frequent keywords and those for the optimal topic model\n",
- "\n",
- "- Allows the user to refine keywords given their intuitions\n",
- "\n",
- "- Plots the desired visuals\n",
- "\n",
- "- Puts all of the above in a directory or zipped file"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Commented out to avoid long run times\n",
- "# gen_files(\n",
- "# method=['lda', 'bert', 'lda_bert'],\n",
- "# text_corpus=text_corpus, \n",
- "# input_language=input_language,\n",
- "# output_language=None,\n",
- "# num_keywords=num_keywords,\n",
- "# topic_nums_to_compare=topic_nums_to_compare,\n",
- "# ignore_words=ignore_words,\n",
- "# min_freq=2,\n",
- "# min_word_len=4,\n",
- "# sample_size=1,\n",
- "# prompt_remove_words=True,\n",
- "# verbose=False, # so progress bar isn't broken online\n",
- "# org_by_pos=False, # organize keywords by part of speech\n",
- "# incl_visuals=['topic_num_evals', 'word_cloud', 'pyLDAvis'], # t_sne not zipping properly\n",
- "# zip_results=True,\n",
- "# )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.7"
- },
- "toc": {
- "base_numbering": 1,
- "nav_menu": {},
- "number_sections": true,
- "sideBar": true,
- "skip_h1_title": false,
- "title_cell": "Table of Contents",
- "title_sidebar": "Contents",
- "toc_cell": true,
- "toc_position": {},
- "toc_section_display": true,
- "toc_window_display": false
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}