From a342c01b17d963ef5c73754edc3686077c6690d5 Mon Sep 17 00:00:00 2001
From: Andrew Tavis <andrew.t.mcallister@gmail.com>
Date: Mon, 15 Mar 2021 14:46:23 +0100
Subject: [PATCH] Example and readme updates and version up

---
 README.md                    |   9 +-
 docs/source/conf.py          |   2 +-
 examples/kw_extraction.ipynb | 195 +++++++++++++++--------------------
 setup.py                     |   2 +-
 4 files changed, 94 insertions(+), 114 deletions(-)

diff --git a/README.md b/README.md
index 80a216f..f2547b9 100644
--- a/README.md
+++ b/README.md
@@ -72,9 +72,10 @@ Keyword extraction can be useful to analyze surveys, tweets and other kinds of s
 
 The following outlines using kwx to derive keywords from a text corpus with `prompt_remove_words` as `True` (the user will be asked if some of the extracted words need to be replaced):
 
+### Text Cleaning
+
 ```python
 from kwx.utils import prepare_data
-from kwx.model import extract_kws
 
 input_language = "english" # see kwx.languages for options
 num_keywords = 15
@@ -91,6 +92,12 @@ text_corpus = prepare_data(
     remove_stopwords=False,  # for BERT
     verbose=True,
 )
+```
+
+### Keyword Extraction
+
+```python
+from kwx.model import extract_kws
 
 # Remove n-grams for BERT training
 corpus_no_ngrams = [
diff --git a/docs/source/conf.py b/docs/source/conf.py
index c098608..72db29b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -24,7 +24,7 @@
 author = "kwx developers"
 
 # The full version, including alpha/beta/rc tags
-release = "0.1.4"
+release = "0.1.5"
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/examples/kw_extraction.ipynb b/examples/kw_extraction.ipynb
index 8c4c7e7..3ac0189 100644
--- a/examples/kw_extraction.ipynb
+++ b/examples/kw_extraction.ipynb
@@ -195,17 +195,15 @@
    },
    "outputs": [],
    "source": [
-    "# The [0] gives us the corpus\n",
-    "# [1] is clean strings for BERT\n",
-    "# [2] the indexes of selected entries if sample_size != 1\n",
     "text_corpus = prepare_data(\n",
     "    data=df_airline_tweets,\n",
     "    target_cols='text',\n",
     "    input_language=input_language, \n",
-    "    min_freq=2,\n",
-    "    min_word_len=4,\n",
-    "    sample_size=1,\n",
-    ")[0]"
+    "    min_freq=2,  # 0 for BERT\n",
+    "    min_word_len=4,  # 0 for BERT\n",
+    "    remove_stopwords=True,  # False for BERT\n",
+    "    verbose=False,\n",
+    ")"
    ]
   },
   {
@@ -285,8 +283,8 @@
     "# return_topics=True gives us the topics themselves\n",
     "topics = extract_kws(\n",
     "    method='LDA',\n",
+    "    bert_st_model=None,\n",
     "    text_corpus=text_corpus,\n",
-    "    clean_texts=None,\n",
     "    input_language=input_language,\n",
     "    output_language=None,\n",
     "    num_keywords=num_keywords,\n",
@@ -294,9 +292,6 @@
     "    corpuses_to_compare=None,\n",
     "    return_topics=True,\n",
     "    ignore_words=None,\n",
-    "    min_freq=2,\n",
-    "    min_word_len=4,\n",
-    "    sample_size=1,\n",
     "    prompt_remove_words=False,\n",
     ")"
    ]
@@ -361,8 +356,7 @@
    },
    "outputs": [],
    "source": [
-    "# The following is a string or list of strings to not include in outputs\n",
-    "# This variable is updated by the user if prompt_remove_words=True\n",
+    "# A string or list of strings to not include in outputs\n",
     "ignore_words = None"
    ]
   },
@@ -383,18 +377,14 @@
    "source": [
     "freq_kws = extract_kws(\n",
     "    method='frequency',\n",
+    "    bert_st_model=None,\n",
     "    text_corpus=text_corpus,\n",
-    "    clean_texts=None,\n",
     "    input_language=input_language,\n",
     "    output_language=None,\n",
     "    num_keywords=num_keywords,\n",
     "    num_topics=num_topics,\n",
     "    corpuses_to_compare=None,\n",
-    "    return_topics=False,\n",
     "    ignore_words=None,\n",
-    "    min_freq=2,\n",
-    "    min_word_len=4,\n",
-    "    sample_size=1,\n",
     "    prompt_remove_words=False\n",
     ")"
    ]
@@ -472,21 +462,20 @@
     }
    ],
    "source": [
+    "# We can pass keyword arguments for gensim.models.ldamulticore.LdaMulticore\n",
     "lda_kws = extract_kws(\n",
     "    method='LDA',\n",
+    "    bert_st_model=None,\n",
     "    text_corpus=text_corpus,\n",
-    "    clean_texts=None,\n",
     "    input_language=input_language,\n",
     "    output_language=None,\n",
     "    num_keywords=num_keywords,\n",
     "    num_topics=num_topics,\n",
-    "    corpuses_to_compare=None,\n",
-    "    return_topics=False,\n",
     "    ignore_words=None,\n",
-    "    min_freq=2,\n",
-    "    min_word_len=4,\n",
-    "    sample_size=1,\n",
-    "    prompt_remove_words=True\n",
+    "    prompt_remove_words=True,\n",
+    "    passes=20,\n",
+    "    eval_every=5,\n",
+    "    decay=0.5,\n",
     ")"
    ]
   },
@@ -536,26 +525,23 @@
    "outputs": [],
    "source": [
     "# Remove n-grams for BERT training\n",
-    "# Clean texts without n-grams has been found to be better than raw texts for BERT\n",
-    "# corpus_no_ngrams = [\n",
-    "#     \" \".join([t for t in text.split(\" \") if \"_\" not in t]) for text in text_corpus\n",
-    "# ]\n",
+    "corpus_no_ngrams = [\n",
+    "    \" \".join([t for t in text.split(\" \") if \"_\" not in t]) for text in text_corpus\n",
+    "]\n",
     "\n",
-    "# bert_kws = extract_kws(\n",
-    "#     method='BERT',\n",
-    "#     text_corpus=corpus_no_ngrams,\n",
-    "#     input_language=input_language,\n",
-    "#     output_language=None,\n",
-    "#     num_keywords=num_keywords,\n",
-    "#     num_topics=num_topics,\n",
-    "#     corpuses_to_compare=None,\n",
-    "#     return_topics=False,\n",
-    "#     ignore_words=ignore_words,\n",
-    "#     min_freq=2,\n",
-    "#     min_word_len=4,\n",
-    "#     sample_size=1,\n",
-    "#     prompt_remove_words=True,\n",
-    "# )"
+    "# We can pass keyword arguments for sentence_transformers.SentenceTransformer.encode\n",
+    "bert_kws = extract_kws(\n",
+    "    method='BERT',\n",
+    "    bert_st_model=\"xlm-r-bert-base-nli-stsb-mean-tokens\",\n",
+    "    text_corpus=corpus_no_ngrams,\n",
+    "    input_language=input_language,\n",
+    "    output_language=None,\n",
+    "    num_keywords=num_keywords,\n",
+    "    num_topics=num_topics,\n",
+    "    ignore_words=ignore_words,\n",
+    "    prompt_remove_words=True,\n",
+    "    batch_size=32,\n",
+    ")"
    ]
   },
   {
@@ -576,11 +562,11 @@
    },
    "outputs": [],
    "source": [
-    "# translate_output(\n",
-    "#     outputs=lda_kws, \n",
-    "#     input_language=input_language, \n",
-    "#     output_language='spanish'\n",
-    "# )"
+    "translate_output(\n",
+    "    outputs=lda_kws, \n",
+    "    input_language=input_language, \n",
+    "    output_language='spanish'\n",
+    ")"
    ]
   },
   {
@@ -660,17 +646,15 @@
    },
    "outputs": [],
    "source": [
-    "# The [0] gives us the corpus\n",
-    "# [1] is clean strings for BERT\n",
-    "# [2] the indexes of selected entries if sample_size != 1\n",
     "united_corpus = prepare_data(\n",
     "    data=df_united,\n",
     "    target_cols='text',\n",
     "    input_language=input_language, \n",
-    "    min_freq=2,\n",
-    "    min_word_len=4,\n",
-    "    sample_size=1,\n",
-    ")[0]"
+    "    min_freq=2,  # 0 for BERT\n",
+    "    min_word_len=4,  # 0 for BERT\n",
+    "    remove_stopwords=True,  # False for BERT\n",
+    "    verbose=False,\n",
+    ")"
    ]
   },
   {
@@ -700,17 +684,15 @@
    },
    "outputs": [],
    "source": [
-    "# The [0] gives us the corpus\n",
-    "# [1] is clean strings for BERT\n",
-    "# [2] the indexes of selected entries if sample_size != 1\n",
     "other_airlines_corpus = prepare_data(\n",
     "    data=df_other_airlines,\n",
     "    target_cols='text',\n",
     "    input_language=input_language, \n",
-    "    min_freq=2,\n",
-    "    min_word_len=4,\n",
-    "    sample_size=1,\n",
-    ")[0]"
+    "    min_freq=2,  # 0 for BERT\n",
+    "    min_word_len=4,  # 0 for BERT\n",
+    "    remove_stopwords=True,  # False for BERT\n",
+    "    verbose=False,\n",
+    ")"
    ]
   },
   {
@@ -725,21 +707,21 @@
    "outputs": [],
    "source": [
     "# Words that are prevalent in United tweets compared to others\n",
+    "# We use the corpuses_to_compare argument to compare prevalent words\n",
+    "\n",
+    "# We can pass keyword arguments for sklearn.feature_extraction.text.TfidfVectorizer\n",
     "tfidf_kws = extract_kws(\n",
     "   method='tfidf',\n",
     "   text_corpus=united_corpus,\n",
-    "   clean_texts=None,\n",
     "   input_language=input_language,\n",
     "   output_language=None,\n",
     "   num_keywords=10,\n",
     "   num_topics=10,\n",
     "   corpuses_to_compare=other_airlines_corpus,\n",
-    "   return_topics=False,\n",
     "   ignore_words=ignore_words,\n",
-    "   min_freq=2,\n",
-    "   min_word_len=4,\n",
-    "   sample_size=1,\n",
     "   prompt_remove_words=False,\n",
+    "   use_idf=True, \n",
+    "   smooth_idf=True,\n",
     ")"
    ]
   },
@@ -816,20 +798,18 @@
    },
    "outputs": [],
    "source": [
-    "# Commented out to avoid long run times\n",
-    "# figure = graph_topic_num_evals(\n",
-    "#     method=['lda', 'bert', 'lda_bert'],\n",
-    "#     text_corpus=text_corpus, \n",
-    "#     input_language=input_language,\n",
-    "#     num_keywords=num_keywords,\n",
-    "#     topic_nums_to_compare=topic_nums_to_compare,\n",
-    "#     sample_size=1,\n",
-    "#     metrics=True,  # stability and coherence\n",
-    "#     save_file=False, # True for pwd or directory name\n",
-    "#     return_ideal_metrics=False, # don't  output ideal model instead of plot\n",
-    "#     verbose=False,  # so progress bar isn't broken online\n",
-    "# )\n",
-    "# plt.show()"
+    "figure = graph_topic_num_evals(\n",
+    "    method=['lda', 'bert', 'lda_bert'],\n",
+    "    text_corpus=text_corpus, \n",
+    "    input_language=input_language,\n",
+    "    num_keywords=num_keywords,\n",
+    "    topic_nums_to_compare=topic_nums_to_compare,\n",
+    "    metrics=True,  # stability and coherence\n",
+    "    save_file=False, # True for pwd or directory name\n",
+    "    verbose=False,  # so progress bar isn't broken online\n",
+    ")\n",
+    "\n",
+    "plt.show()"
    ]
   },
   {
@@ -850,15 +830,13 @@
    },
    "outputs": [],
    "source": [
-    "# Commented out as it changes the output dimensions due to its width\n",
-    "# pyLDAvis_topics(\n",
-    "#     method='lda',\n",
-    "#     text_corpus=text_corpus, \n",
-    "#     input_language=input_language,\n",
-    "#     num_topics=num_topics,\n",
-    "#     save_file=False, # True for pwd or directory name\n",
-    "#     display_ipython=True,  # <- show in Jupyter notebook\n",
-    "# )"
+    "pyLDAvis_topics(\n",
+    "    method='lda',\n",
+    "    text_corpus=text_corpus, \n",
+    "    num_topics=num_topics,\n",
+    "    save_file=False, # True for pwd or directory name\n",
+    "    display_ipython=True,  # <- show in Jupyter notebook\n",
+    ")"
    ]
   },
   {
@@ -916,7 +894,6 @@
    "source": [
     "gen_word_cloud(\n",
     "    text_corpus=text_corpus,\n",
-    "    input_language=input_language,\n",
     "    ignore_words=ignore_words,\n",
     "    height=500,\n",
     "    save_file=False, # True for pwd or directory name\n",
@@ -999,24 +976,20 @@
    },
    "outputs": [],
    "source": [
-    "# Commented out to avoid long run times\n",
-    "# gen_files(\n",
-    "#     method=['lda', 'bert', 'lda_bert'],\n",
-    "#     text_corpus=text_corpus, \n",
-    "#     input_language=input_language,\n",
-    "#     output_language=None,\n",
-    "#     num_keywords=num_keywords,\n",
-    "#     topic_nums_to_compare=topic_nums_to_compare,\n",
-    "#     ignore_words=ignore_words,\n",
-    "#     min_freq=2,\n",
-    "#     min_word_len=4,\n",
-    "#     sample_size=1,\n",
-    "#     prompt_remove_words=True,\n",
-    "#     verbose=False,  # so progress bar isn't broken online\n",
-    "#     org_by_pos=False,  # organize keywords by part of speech\n",
-    "#     incl_visuals=['topic_num_evals', 'word_cloud', 'pyLDAvis'],  # t_sne not zipping properly\n",
-    "#     zip_results=True,\n",
-    "# )"
+    "gen_files(\n",
+    "    method=['lda', 'bert', 'lda_bert'],\n",
+    "    text_corpus=text_corpus, \n",
+    "    input_language=input_language,\n",
+    "    output_language=None,\n",
+    "    num_keywords=num_keywords,\n",
+    "    topic_nums_to_compare=topic_nums_to_compare,\n",
+    "    ignore_words=ignore_words,\n",
+    "    prompt_remove_words=True,\n",
+    "    verbose=True,\n",
+    "    org_by_pos=False,  # organize keywords by part of speech\n",
+    "    incl_visuals=['topic_num_evals', 'word_cloud', 'pyLDAvis'],  # t_sne not zipping properly\n",
+    "    zip_results=True,\n",
+    ")"
    ]
   },
   {
@@ -1043,7 +1016,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.7.9"
   },
   "toc": {
    "base_numbering": 1,
diff --git a/setup.py b/setup.py
index 7f4fae2..da3017a 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup_args = dict(
     name="kwx",
-    version="0.1.4",
+    version="0.1.5",
     author="Andrew Tavis McAllister",
     author_email="andrew.t.mcallister@gmail.com",
     classifiers=[