From 019b3028ddc732f73f6c7c7ad0867a221764f04d Mon Sep 17 00:00:00 2001 From: Andrew McAllister Date: Sun, 31 Jan 2021 09:31:56 +0100 Subject: [PATCH] Add output to readme example --- README.md | 21 ++- examples/kw_extraction.ipynb | 266 ++++++++++++++++++++++++++++------- 2 files changed, 235 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 45010e8..9b4c664 100644 --- a/README.md +++ b/README.md @@ -81,8 +81,8 @@ The following outlines using kwx to derive keywords from a text corpus with `pro from kwx.utils import prepare_data from kwx.model import extract_kws -input_language = "english" -num_keywords = 10 +input_language = "english" # see kwx.languages for options +num_keywords = 15 num_topics = 10 ignore_words = ["words", "user", "knows", "they", "don't", "want"] @@ -110,6 +110,23 @@ bert_kws = extract_kws( ) ``` +``` +The BERT keywords are: + +['time', 'flight', 'plane', 'southwestair', 'ticket', 'cancel', 'united', 'baggage', +'love', 'virginamerica', 'service', 'customer', 'delay', 'late', 'hour'] + +Are there words that should be removed [y/n]? y +Type or copy word(s) to be removed: southwestair, united, virginamerica + +The new BERT keywords are: + +['late', 'baggage', 'service', 'flight', 'time', 'love', 'book', 'customer', +'response', 'hold', 'hour', 'cancel', 'cancelled_flighted', 'delay', 'plane'] + +Are there words that should be removed [y/n]? n +``` + The model will be re-ran until all words known to be unreasonable are removed for a suitable output. `kwx.model.gen_files` could also be used as a run-all function that produces a directory with a keyword text file and visuals (for experienced users wanting quick results). # Visuals diff --git a/examples/kw_extraction.ipynb b/examples/kw_extraction.ipynb index 8746c41..f9def64 100644 --- a/examples/kw_extraction.ipynb +++ b/examples/kw_extraction.ipynb @@ -26,11 +26,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2021-01-29T19:27:58.045088Z", - "start_time": "2021-01-29T19:27:56.856283Z" + "end_time": "2021-01-31T07:59:40.153415Z", + "start_time": "2021-01-31T07:59:33.083450Z" }, "slideshow": { "slide_type": "skip" @@ -83,24 +83,75 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2021-01-29T19:27:59.465869Z", - "start_time": "2021-01-29T19:27:59.369975Z" + "end_time": "2021-01-31T07:59:43.242510Z", + "start_time": "2021-01-31T07:59:43.169963Z" } }, "outputs": [ { - "ename": "AttributeError", - "evalue": "module 'kwgen' has no attribute 'utils'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_corpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwgen\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'data/Tweets.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdf_corpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: module 'kwgen' has no attribute 'utils'" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
15@VirginAmerica SFO-PDX schedule is still MIA.
16@VirginAmerica So excited for my first cross c...
17@VirginAmerica I flew from NYC to SFO last we...
18I ❤️ flying @VirginAmerica. ☺️👍
19@VirginAmerica you know what would be amazingl...
\n", + "
" + ], + "text/plain": [ + " text\n", + "15 @VirginAmerica SFO-PDX schedule is still MIA.\n", + "16 @VirginAmerica So excited for my first cross c...\n", + "17 @VirginAmerica I flew from NYC to SFO last we...\n", + "18 I ❤️ flying @VirginAmerica. ☺️👍\n", + "19 @VirginAmerica you know what would be amazingl..." + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -117,11 +168,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2021-01-30T16:02:53.717597Z", - "start_time": "2021-01-30T16:02:53.715299Z" + "end_time": "2021-01-31T07:59:46.560388Z", + "start_time": "2021-01-31T07:59:46.557648Z" } }, "outputs": [], @@ -131,14 +182,19 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2021-01-31T08:01:14.767287Z", + "start_time": "2021-01-31T07:59:55.997765Z" + } + }, "outputs": [], "source": [ "# The [0] gives us the corpus\n", "# [1] is clean strings for BERT\n", "# [2] the indexes of selected entries if sample_size != 1\n", - "text_corpus = prepare_text_data(\n", + "text_corpus = prepare_data(\n", " data=df_airline_tweets,\n", " target_cols='text',\n", " input_language=input_language, \n", @@ -150,9 +206,38 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2021-01-31T08:01:16.802475Z", + "start_time": "2021-01-31T08:01:16.798131Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[['virginamerica', 'schedule'],\n", + " ['virgin_america',\n", + " 'cross_country',\n", + " 'virginamerica',\n", + " 'excited',\n", + " 'cross',\n", + " 'country',\n", + " 'flight',\n", + " 'hear',\n", + " 'virgin',\n", + " 'america'],\n", + " ['virginamerica', 'week', 'seat', 'gentleman'],\n", + " ['virginamerica'],\n", + " ['virginamerica', 'amazingly', 'awesome']]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "text_corpus[15:20]" ] @@ -169,8 +254,23 @@ "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-11-09T19:02:40.001462Z", - "start_time": "2020-11-09T19:02:31.106061Z" + "end_time": "2021-01-31T08:25:37.421746Z", + "start_time": "2021-01-31T08:25:37.419367Z" + } + }, + "outputs": [], + "source": [ + "num_keywords = 15\n", + "num_topics = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2021-01-31T08:03:30.958104Z", + "start_time": "2021-01-31T08:01:36.750490Z" }, "slideshow": { "slide_type": "slide" @@ -185,8 +285,8 @@ " clean_texts=None,\n", " input_language=input_language,\n", " output_language=None,\n", - " num_keywords=15,\n", - " num_topics=10,\n", + " num_keywords=num_keywords,\n", + " num_topics=num_topics,\n", " corpuses_to_compare=None,\n", " return_topics=True,\n", " ignore_words=None,\n", @@ -198,9 +298,39 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2021-01-31T08:03:58.445759Z", + "start_time": "2021-01-31T08:03:58.442117Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['hour',\n", + " 'usairway',\n", + " 'hold',\n", + " 'southwestair',\n", + " 'flight',\n", + " 'wait',\n", + " 'united',\n", + " 'americanair',\n", + " 'minute',\n", + " 'time',\n", + " 'luggage',\n", + " 'phone',\n", + " 'follow',\n", + " 'answer',\n", + " 'baggage']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "topics[0]" ] @@ -214,11 +344,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-11-09T20:23:19.788818Z", - "start_time": "2020-11-09T20:23:19.786309Z" + "end_time": "2021-01-31T08:04:11.372380Z", + "start_time": "2021-01-31T08:04:11.369974Z" }, "slideshow": { "slide_type": "slide" @@ -227,7 +357,7 @@ "outputs": [], "source": [ "# The following is a string or list of strings to not include in outputs\n", - "# This variable is updated in the keyword selection step of gen_files\n", + "# This variable is updated by the user if prompt_remove_words=True\n", "ignore_words = None" ] }, @@ -252,8 +382,8 @@ " clean_texts=None,\n", " input_language=input_language,\n", " output_language=None,\n", - " num_keywords=15,\n", - " num_topics=10,\n", + " num_keywords=num_keywords,\n", + " num_topics=num_topics,\n", " corpuses_to_compare=None,\n", " return_topics=False,\n", " ignore_words=None,\n", @@ -275,17 +405,53 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2020-11-09T19:02:53.171585Z", - "start_time": "2020-11-09T19:02:44.447035Z" + "end_time": "2021-01-31T08:19:14.945990Z", + "start_time": "2021-01-31T08:04:25.526000Z" }, "slideshow": { "slide_type": "slide" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The LDA keywords are:\n", + "\n", + "['jetblue', 'united', 'flight', 'service', 'airline', 'southwestair', 'time', 'baggage', 'crew', 'attendant', 'cancel', 'americanair', 'hour', 'usairway', 'delay']\n", + "\n", + "Are there words that should be removed [y/n]? y\n", + "Type or copy word(s) to be removed: jetblue, united, americanair, usairway\n", + "\n", + "\n", + "The new LDA keywords are:\n", + "\n", + "['book', 'flight', 'southwestair', 'seat', 'service', 'plane', 'hour', 'hold', 'cancel', 'cancelled_flighted', 'delay', 'late', 'lose', 'baggage', 'airline']\n", + "\n", + "Are there words that should be removed [y/n]? y\n", + "Type or copy word(s) to be removed: southwestair\n", + "\n", + "\n", + "The new LDA keywords are:\n", + "\n", + "['time', 'flight', 'plane', 'fleek', 'ticket', 'cancel', 'usairways', 'baggage', 'love', 'virginamerica', 'service', 'customer', 'delay', 'late', 'hour']\n", + "\n", + "Are there words that should be removed [y/n]? y\n", + "Type or copy word(s) to be removed: virginamerica, fleek, usairways\n", + "\n", + "\n", + "The new LDA keywords are:\n", + "\n", + "['late', 'fleet', 'service', 'flight', 'time', 'love', 'book', 'customer', 'response', 'hold', 'hour', 'cancel', 'cancelled_flighted', 'delay', 'plane']\n", + "\n", + "Are there words that should be removed [y/n]? n\n" + ] + } + ], "source": [ "lda_kws = extract_kws(\n", " method='LDA',\n", @@ -293,8 +459,8 @@ " clean_texts=None,\n", " input_language=input_language,\n", " output_language=None,\n", - " num_keywords=15,\n", - " num_topics=10,\n", + " num_keywords=num_keywords,\n", + " num_topics=num_topics,\n", " corpuses_to_compare=None,\n", " return_topics=False,\n", " ignore_words=None,\n", @@ -377,7 +543,7 @@ "# The [0] gives us the corpus\n", "# [1] is clean strings for BERT\n", "# [2] the indexes of selected entries if sample_size != 1\n", - "united_corpus = prepare_text_data(\n", + "united_corpus = prepare_data(\n", " data=df_united,\n", " target_cols='text',\n", " input_language=input_language, \n", @@ -407,7 +573,7 @@ "# The [0] gives us the corpus\n", "# [1] is clean strings for BERT\n", "# [2] the indexes of selected entries if sample_size != 1\n", - "other_airlines_corpus = prepare_text_data(\n", + "other_airlines_corpus = prepare_data(\n", " data=df_other_airlines,\n", " target_cols='text',\n", " input_language=input_language, \n", @@ -476,7 +642,7 @@ "metadata": {}, "outputs": [], "source": [ - "topic_nums_to_compare = list(range(5,15))" + "topic_nums_to_compare = list(range(5, 16))" ] }, { @@ -490,7 +656,7 @@ " text_corpus=text_corpus, \n", " clean_texts=None,\n", " input_language=input_language,\n", - " num_keywords=10,\n", + " num_keywords=num_keywords,\n", " topic_nums_to_compare=topic_nums_to_compare,\n", " min_freq=2,\n", " min_word_len=4,\n", @@ -528,7 +694,7 @@ "# method='lda',\n", "# text_corpus=text_corpus, \n", "# input_language=input_language,\n", - "# num_topics=10,\n", + "# num_topics=num_topics,\n", "# min_freq=2,\n", "# min_word_len=4,\n", "# save_file=False,\n", @@ -586,7 +752,7 @@ "t_sne(\n", " dimension=\"both\", \n", " text_corpus=text_corpus, \n", - " num_topics=10, \n", + " num_topics=num_topics, \n", " remove_3d_outliers=True,\n", " fig_size=(20, 10),\n", " save_file=False,\n", @@ -633,7 +799,7 @@ " clean_texts=None,\n", " input_language=input_language,\n", " output_language=None,\n", - " num_keywords=15,\n", + " num_keywords=num_keywords,\n", " topic_nums_to_compare=topic_nums_to_compare,\n", " corpuses_to_compare=None,\n", " ignore_words=None,\n",