From a342c01b17d963ef5c73754edc3686077c6690d5 Mon Sep 17 00:00:00 2001 From: Andrew Tavis Date: Mon, 15 Mar 2021 14:46:23 +0100 Subject: [PATCH] Example and readme updates and version up --- README.md | 9 +- docs/source/conf.py | 2 +- examples/kw_extraction.ipynb | 195 +++++++++++++++-------------------- setup.py | 2 +- 4 files changed, 94 insertions(+), 114 deletions(-) diff --git a/README.md b/README.md index 80a216f..f2547b9 100644 --- a/README.md +++ b/README.md @@ -72,9 +72,10 @@ Keyword extraction can be useful to analyze surveys, tweets and other kinds of s The following outlines using kwx to derive keywords from a text corpus with `prompt_remove_words` as `True` (the user will be asked if some of the extracted words need to be replaced): +### Text Cleaning + ```python from kwx.utils import prepare_data -from kwx.model import extract_kws input_language = "english" # see kwx.languages for options num_keywords = 15 @@ -91,6 +92,12 @@ text_corpus = prepare_data( remove_stopwords=False, # for BERT verbose=True, ) +``` + +### Keyword Extraction + +```python +from kwx.model import extract_kws # Remove n-grams for BERT training corpus_no_ngrams = [ diff --git a/docs/source/conf.py b/docs/source/conf.py index c098608..72db29b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,7 +24,7 @@ author = "kwx developers" # The full version, including alpha/beta/rc tags -release = "0.1.4" +release = "0.1.5" # -- General configuration --------------------------------------------------- diff --git a/examples/kw_extraction.ipynb b/examples/kw_extraction.ipynb index 8c4c7e7..3ac0189 100644 --- a/examples/kw_extraction.ipynb +++ b/examples/kw_extraction.ipynb @@ -195,17 +195,15 @@ }, "outputs": [], "source": [ - "# The [0] gives us the corpus\n", - "# [1] is clean strings for BERT\n", - "# [2] the indexes of selected entries if sample_size != 1\n", "text_corpus = prepare_data(\n", " data=df_airline_tweets,\n", " target_cols='text',\n", " 
input_language=input_language, \n", - " min_freq=2,\n", - " min_word_len=4,\n", - " sample_size=1,\n", - ")[0]" + " min_freq=2, # 0 for BERT\n", + " min_word_len=4, # 0 for BERT\n", + " remove_stopwords=True, # False for BERT\n", + " verbose=False,\n", + ")" ] }, { @@ -285,8 +283,8 @@ "# return_topics=True gives us the topics themselves\n", "topics = extract_kws(\n", " method='LDA',\n", + " bert_st_model=None,\n", " text_corpus=text_corpus,\n", - " clean_texts=None,\n", " input_language=input_language,\n", " output_language=None,\n", " num_keywords=num_keywords,\n", @@ -294,9 +292,6 @@ " corpuses_to_compare=None,\n", " return_topics=True,\n", " ignore_words=None,\n", - " min_freq=2,\n", - " min_word_len=4,\n", - " sample_size=1,\n", " prompt_remove_words=False,\n", ")" ] @@ -361,8 +356,7 @@ }, "outputs": [], "source": [ - "# The following is a string or list of strings to not include in outputs\n", - "# This variable is updated by the user if prompt_remove_words=True\n", + "# A string or list of strings to not include in outputs\n", "ignore_words = None" ] }, @@ -383,18 +377,14 @@ "source": [ "freq_kws = extract_kws(\n", " method='frequency',\n", + " bert_st_model=None,\n", " text_corpus=text_corpus,\n", - " clean_texts=None,\n", " input_language=input_language,\n", " output_language=None,\n", " num_keywords=num_keywords,\n", " num_topics=num_topics,\n", " corpuses_to_compare=None,\n", - " return_topics=False,\n", " ignore_words=None,\n", - " min_freq=2,\n", - " min_word_len=4,\n", - " sample_size=1,\n", " prompt_remove_words=False\n", ")" ] @@ -472,21 +462,20 @@ } ], "source": [ + "# We can pass keywords for gensim.models.ldamulticore.LdaMulticore\n", "lda_kws = extract_kws(\n", " method='LDA',\n", + " bert_st_model=None,\n", " text_corpus=text_corpus,\n", - " clean_texts=None,\n", " input_language=input_language,\n", " output_language=None,\n", " num_keywords=num_keywords,\n", " num_topics=num_topics,\n", - " corpuses_to_compare=None,\n", - " 
return_topics=False,\n", " ignore_words=None,\n", - " min_freq=2,\n", - " min_word_len=4,\n", - " sample_size=1,\n", - " prompt_remove_words=True\n", + " prompt_remove_words=True,\n", + " passes=20,\n", + " eval_every=5,\n", + " decay=0.5,\n", ")" ] }, @@ -536,26 +525,23 @@ "outputs": [], "source": [ "# Remove n-grams for BERT training\n", - "# Clean texts without n-grams has been found to be better than raw texts for BERT\n", - "# corpus_no_ngrams = [\n", - "# \" \".join([t for t in text.split(\" \") if \"_\" not in t]) for text in text_corpus\n", - "# ]\n", + "corpus_no_ngrams = [\n", + " \" \".join([t for t in text.split(\" \") if \"_\" not in t]) for text in text_corpus\n", + "]\n", "\n", - "# bert_kws = extract_kws(\n", - "# method='BERT',\n", - "# text_corpus=corpus_no_ngrams,\n", - "# input_language=input_language,\n", - "# output_language=None,\n", - "# num_keywords=num_keywords,\n", - "# num_topics=num_topics,\n", - "# corpuses_to_compare=None,\n", - "# return_topics=False,\n", - "# ignore_words=ignore_words,\n", - "# min_freq=2,\n", - "# min_word_len=4,\n", - "# sample_size=1,\n", - "# prompt_remove_words=True,\n", - "# )" + "# We can pass keywords for sentence_transformers.SentenceTransformer.encode\n", + "bert_kws = extract_kws(\n", + " method='BERT',\n", + " bert_st_model=\"xlm-r-bert-base-nli-stsb-mean-tokens\",\n", + " text_corpus=corpus_no_ngrams,\n", + " input_language=input_language,\n", + " output_language=None,\n", + " num_keywords=num_keywords,\n", + " num_topics=num_topics,\n", + " ignore_words=ignore_words,\n", + " prompt_remove_words=True,\n", + " batch_size=32,\n", + ")" ] }, { @@ -576,11 +562,11 @@ }, "outputs": [], "source": [ - "# translate_output(\n", - "# outputs=lda_kws, \n", - "# input_language=input_language, \n", - "# output_language='spanish'\n", - "# )" + "translate_output(\n", + " outputs=lda_kws, \n", + " input_language=input_language, \n", + " output_language='spanish'\n", + ")" ] }, { @@ -660,17 +646,15 @@ }, "outputs": [], 
"source": [ - "# The [0] gives us the corpus\n", - "# [1] is clean strings for BERT\n", - "# [2] the indexes of selected entries if sample_size != 1\n", "united_corpus = prepare_data(\n", " data=df_united,\n", " target_cols='text',\n", " input_language=input_language, \n", - " min_freq=2,\n", - " min_word_len=4,\n", - " sample_size=1,\n", - ")[0]" + " min_freq=2, # 0 for BERT\n", + " min_word_len=4, # 0 for BERT\n", + " remove_stopwords=True, # False for BERT\n", + " verbose=False,\n", + ")" ] }, { @@ -700,17 +684,15 @@ }, "outputs": [], "source": [ - "# The [0] gives us the corpus\n", - "# [1] is clean strings for BERT\n", - "# [2] the indexes of selected entries if sample_size != 1\n", "other_airlines_corpus = prepare_data(\n", " data=df_other_airlines,\n", " target_cols='text',\n", " input_language=input_language, \n", - " min_freq=2,\n", - " min_word_len=4,\n", - " sample_size=1,\n", - ")[0]" + " min_freq=2, # 0 for BERT\n", + " min_word_len=4, # 0 for BERT\n", + " remove_stopwords=True, # False for BERT\n", + " verbose=False,\n", + ")" ] }, { @@ -725,21 +707,21 @@ "outputs": [], "source": [ "# Words that are prevalent in United tweets compared to others\n", + "# We use the corpuses_to_compare argument to compare prevalent words\n", + "\n", + "# We can pass keywords for sklearn.feature_extraction.text.TfidfVectorizer\n", "tfidf_kws = extract_kws(\n", " method='tfidf',\n", " text_corpus=united_corpus,\n", - " clean_texts=None,\n", " input_language=input_language,\n", " output_language=None,\n", " num_keywords=10,\n", " num_topics=10,\n", " corpuses_to_compare=other_airlines_corpus,\n", - " return_topics=False,\n", " ignore_words=ignore_words,\n", - " min_freq=2,\n", - " min_word_len=4,\n", - " sample_size=1,\n", " prompt_remove_words=False,\n", + " use_idf=True, \n", + " smooth_idf=True,\n", ")" ] }, @@ -816,20 +798,18 @@ }, "outputs": [], "source": [ - "# Commented out to avoid long run times\n", - "# figure = graph_topic_num_evals(\n", - "# method=['lda', 
'bert', 'lda_bert'],\n", - "# text_corpus=text_corpus, \n", - "# input_language=input_language,\n", - "# num_keywords=num_keywords,\n", - "# topic_nums_to_compare=topic_nums_to_compare,\n", - "# sample_size=1,\n", - "# metrics=True, # stability and coherence\n", - "# save_file=False, # True for pwd or directory name\n", - "# return_ideal_metrics=False, # don't output ideal model instead of plot\n", - "# verbose=False, # so progress bar isn't broken online\n", - "# )\n", - "# plt.show()" + "figure = graph_topic_num_evals(\n", + " method=['lda', 'bert', 'lda_bert'],\n", + " text_corpus=text_corpus, \n", + " input_language=input_language,\n", + " num_keywords=num_keywords,\n", + " topic_nums_to_compare=topic_nums_to_compare,\n", + " metrics=True, # stability and coherence\n", + " save_file=False, # True for pwd or directory name\n", + " verbose=False, # so progress bar isn't broken online\n", + ")\n", + "\n", + "plt.show()" ] }, { @@ -850,15 +830,13 @@ }, "outputs": [], "source": [ - "# Commented out as it changes the output dimensions due to its width\n", - "# pyLDAvis_topics(\n", - "# method='lda',\n", - "# text_corpus=text_corpus, \n", - "# input_language=input_language,\n", - "# num_topics=num_topics,\n", - "# save_file=False, # True for pwd or directory name\n", - "# display_ipython=True, # <- show in Jupyter notebook\n", - "# )" + "pyLDAvis_topics(\n", + " method='lda',\n", + " text_corpus=text_corpus, \n", + " num_topics=num_topics,\n", + " save_file=False, # True for pwd or directory name\n", + " display_ipython=True, # <- show in Jupyter notebook\n", + ")" ] }, { @@ -916,7 +894,6 @@ "source": [ "gen_word_cloud(\n", " text_corpus=text_corpus,\n", - " input_language=input_language,\n", " ignore_words=ignore_words,\n", " height=500,\n", " save_file=False, # True for pwd or directory name\n", @@ -999,24 +976,20 @@ }, "outputs": [], "source": [ - "# Commented out to avoid long run times\n", - "# gen_files(\n", - "# method=['lda', 'bert', 'lda_bert'],\n", - "# 
text_corpus=text_corpus, \n", - "# input_language=input_language,\n", - "# output_language=None,\n", - "# num_keywords=num_keywords,\n", - "# topic_nums_to_compare=topic_nums_to_compare,\n", - "# ignore_words=ignore_words,\n", - "# min_freq=2,\n", - "# min_word_len=4,\n", - "# sample_size=1,\n", - "# prompt_remove_words=True,\n", - "# verbose=False, # so progress bar isn't broken online\n", - "# org_by_pos=False, # organize keywords by part of speech\n", - "# incl_visuals=['topic_num_evals', 'word_cloud', 'pyLDAvis'], # t_sne not zipping properly\n", - "# zip_results=True,\n", - "# )" + "gen_files(\n", + " method=['lda', 'bert', 'lda_bert'],\n", + " text_corpus=text_corpus, \n", + " input_language=input_language,\n", + " output_language=None,\n", + " num_keywords=num_keywords,\n", + " topic_nums_to_compare=topic_nums_to_compare,\n", + " ignore_words=ignore_words,\n", + " prompt_remove_words=True,\n", + " verbose=True,\n", + " org_by_pos=False, # organize keywords by part of speech\n", + " incl_visuals=['topic_num_evals', 'word_cloud', 'pyLDAvis'], # t_sne not zipping properly\n", + " zip_results=True,\n", + ")" ] }, { @@ -1043,7 +1016,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.7.9" }, "toc": { "base_numbering": 1, diff --git a/setup.py b/setup.py index 7f4fae2..da3017a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup_args = dict( name="kwx", - version="0.1.4", + version="0.1.5", author="Andrew Tavis McAllister", author_email="andrew.t.mcallister@gmail.com", classifiers=[