Example and readme updates and version up
andrewtavis committed Mar 15, 2021
1 parent a58255f commit a342c01
Showing 4 changed files with 94 additions and 114 deletions.
9 changes: 8 additions & 1 deletion README.md
@@ -72,9 +72,10 @@ Keyword extraction can be useful to analyze surveys, tweets and other kinds of s

The following outlines using kwx to derive keywords from a text corpus with `prompt_remove_words` as `True` (the user will be asked if some of the extracted words need to be replaced):

### Text Cleaning

```python
from kwx.utils import prepare_data
from kwx.model import extract_kws

input_language = "english" # see kwx.languages for options
num_keywords = 15
@@ -91,6 +92,12 @@ text_corpus = prepare_data(
remove_stopwords=False, # for BERT
verbose=True,
)
```

### Keyword Extraction

```python
from kwx.model import extract_kws

# Remove n-grams for BERT training
corpus_no_ngrams = [
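Assembled from the hunks above, a minimal end-to-end sketch of the flow the README now documents in its two sections — the CSV path and its contents are hypothetical placeholders, while the function names, model name, and argument values are the ones shown elsewhere in this diff:

```python
import pandas as pd

from kwx.model import extract_kws
from kwx.utils import prepare_data

df_tweets = pd.read_csv("airline_tweets.csv")  # hypothetical input file

text_corpus = prepare_data(
    data=df_tweets,
    target_cols="text",
    input_language="english",  # see kwx.languages for options
    min_freq=0,  # 0 for BERT
    min_word_len=0,  # 0 for BERT
    remove_stopwords=False,  # False for BERT
    verbose=True,
)

# Remove n-grams, which the examples prefer over raw texts for BERT
corpus_no_ngrams = [
    " ".join([t for t in text.split(" ") if "_" not in t]) for text in text_corpus
]

bert_kws = extract_kws(
    method="BERT",
    bert_st_model="xlm-r-bert-base-nli-stsb-mean-tokens",
    text_corpus=corpus_no_ngrams,
    input_language="english",
    num_keywords=15,
    num_topics=10,
    prompt_remove_words=True,  # confirm or replace unwanted keywords
)
```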
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -24,7 +24,7 @@
author = "kwx developers"

# The full version, including alpha/beta/rc tags
release = "0.1.4"
release = "0.1.5"


# -- General configuration ---------------------------------------------------
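The release string here is bumped in tandem with the version in setup.py below. As a hedged aside: if single-sourcing were wanted, conf.py could instead read the version from the installed package — a common Sphinx pattern, not something this commit does, and it assumes Python ≥ 3.8:

```python
# docs/source/conf.py -- hypothetical alternative to the hard-coded string
from importlib.metadata import version as pkg_version

release = pkg_version("kwx")  # "0.1.5" once this commit's package is installed
```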
195 changes: 84 additions & 111 deletions examples/kw_extraction.ipynb
@@ -195,17 +195,15 @@
},
"outputs": [],
"source": [
"# The [0] gives us the corpus\n",
"# [1] is clean strings for BERT\n",
"# [2] the indexes of selected entries if sample_size != 1\n",
"text_corpus = prepare_data(\n",
" data=df_airline_tweets,\n",
" target_cols='text',\n",
" input_language=input_language, \n",
" min_freq=2,\n",
" min_word_len=4,\n",
" sample_size=1,\n",
")[0]"
" min_freq=2, # 0 for BERT\n",
" min_word_len=4, # 0 for BERT\n",
" remove_stopwords=True, # False for BERT\n",
" verbose=False,\n",
")"
]
},
{
@@ -285,18 +283,15 @@
"# return_topics=True gives us the topics themselves\n",
"topics = extract_kws(\n",
" method='LDA',\n",
" bert_st_model=None,\n",
" text_corpus=text_corpus,\n",
" clean_texts=None,\n",
" input_language=input_language,\n",
" output_language=None,\n",
" num_keywords=num_keywords,\n",
" num_topics=num_topics,\n",
" corpuses_to_compare=None,\n",
" return_topics=True,\n",
" ignore_words=None,\n",
" min_freq=2,\n",
" min_word_len=4,\n",
" sample_size=1,\n",
" prompt_remove_words=False,\n",
")"
]
@@ -361,8 +356,7 @@
},
"outputs": [],
"source": [
"# The following is a string or list of strings to not include in outputs\n",
"# This variable is updated by the user if prompt_remove_words=True\n",
"# A string or list of strings to not include in outputs\n",
"ignore_words = None"
]
},
@@ -383,18 +377,14 @@
"source": [
"freq_kws = extract_kws(\n",
" method='frequency',\n",
" bert_st_model=None,\n",
" text_corpus=text_corpus,\n",
" clean_texts=None,\n",
" input_language=input_language,\n",
" output_language=None,\n",
" num_keywords=num_keywords,\n",
" num_topics=num_topics,\n",
" corpuses_to_compare=None,\n",
" return_topics=False,\n",
" ignore_words=None,\n",
" min_freq=2,\n",
" min_word_len=4,\n",
" sample_size=1,\n",
" prompt_remove_words=False\n",
")"
]
@@ -472,21 +462,20 @@
}
],
"source": [
"# We can pass keywords for gensim.models.ldamulticore.LdaMulticore\n",
"lda_kws = extract_kws(\n",
" method='LDA',\n",
" bert_st_model=None,\n",
" text_corpus=text_corpus,\n",
" clean_texts=None,\n",
" input_language=input_language,\n",
" output_language=None,\n",
" num_keywords=num_keywords,\n",
" num_topics=num_topics,\n",
" corpuses_to_compare=None,\n",
" return_topics=False,\n",
" ignore_words=None,\n",
" min_freq=2,\n",
" min_word_len=4,\n",
" sample_size=1,\n",
" prompt_remove_words=True\n",
" prompt_remove_words=True,\n",
" passes=20,\n",
" eval_every=5,\n",
" decay=0.5,\n",
")"
]
},
@@ -536,26 +525,23 @@
"outputs": [],
"source": [
"# Remove n-grams for BERT training\n",
"# Clean texts without n-grams has been found to be better than raw texts for BERT\n",
"# corpus_no_ngrams = [\n",
"# \" \".join([t for t in text.split(\" \") if \"_\" not in t]) for text in text_corpus\n",
"# ]\n",
"corpus_no_ngrams = [\n",
" \" \".join([t for t in text.split(\" \") if \"_\" not in t]) for text in text_corpus\n",
"]\n",
"\n",
"# bert_kws = extract_kws(\n",
"# method='BERT',\n",
"# text_corpus=corpus_no_ngrams,\n",
"# input_language=input_language,\n",
"# output_language=None,\n",
"# num_keywords=num_keywords,\n",
"# num_topics=num_topics,\n",
"# corpuses_to_compare=None,\n",
"# return_topics=False,\n",
"# ignore_words=ignore_words,\n",
"# min_freq=2,\n",
"# min_word_len=4,\n",
"# sample_size=1,\n",
"# prompt_remove_words=True,\n",
"# )"
"# We can pass keywords for sentence_transformers.SentenceTransformer.encode\n",
"bert_kws = extract_kws(\n",
" method='BERT',\n",
" bert_st_model=\"xlm-r-bert-base-nli-stsb-mean-tokens\",\n",
" text_corpus=corpus_no_ngrams,\n",
" input_language=input_language,\n",
" output_language=None,\n",
" num_keywords=num_keywords,\n",
" num_topics=num_topics,\n",
" ignore_words=ignore_words,\n",
" prompt_remove_words=True,\n",
" batch_size=32,\n",
")"
]
},
{
@@ -576,11 +562,11 @@
},
"outputs": [],
"source": [
"# translate_output(\n",
"# outputs=lda_kws, \n",
"# input_language=input_language, \n",
"# output_language='spanish'\n",
"# )"
"translate_output(\n",
" outputs=lda_kws, \n",
" input_language=input_language, \n",
" output_language='spanish'\n",
")"
]
},
{
@@ -660,17 +646,15 @@
},
"outputs": [],
"source": [
"# The [0] gives us the corpus\n",
"# [1] is clean strings for BERT\n",
"# [2] the indexes of selected entries if sample_size != 1\n",
"united_corpus = prepare_data(\n",
" data=df_united,\n",
" target_cols='text',\n",
" input_language=input_language, \n",
" min_freq=2,\n",
" min_word_len=4,\n",
" sample_size=1,\n",
")[0]"
" min_freq=2, # 0 for BERT\n",
" min_word_len=4, # 0 for BERT\n",
" remove_stopwords=True, # False for BERT\n",
" verbose=False,\n",
")"
]
},
{
@@ -700,17 +684,15 @@
},
"outputs": [],
"source": [
"# The [0] gives us the corpus\n",
"# [1] is clean strings for BERT\n",
"# [2] the indexes of selected entries if sample_size != 1\n",
"other_airlines_corpus = prepare_data(\n",
" data=df_other_airlines,\n",
" target_cols='text',\n",
" input_language=input_language, \n",
" min_freq=2,\n",
" min_word_len=4,\n",
" sample_size=1,\n",
")[0]"
" min_freq=2, # 0 for BERT\n",
" min_word_len=4, # 0 for BERT\n",
" remove_stopwords=True, # False for BERT\n",
" verbose=False,\n",
")"
]
},
{
@@ -725,21 +707,21 @@
"outputs": [],
"source": [
"# Words that are prevalent in United tweets compared to others\n",
"# We use the corpuses_to_compare argument to compare prevalent words\n",
"\n",
"# We can pass keywords for sklearn.feature_extraction.text.TfidfVectorize\n",
"tfidf_kws = extract_kws(\n",
" method='tfidf',\n",
" text_corpus=united_corpus,\n",
" clean_texts=None,\n",
" input_language=input_language,\n",
" output_language=None,\n",
" num_keywords=10,\n",
" num_topics=10,\n",
" corpuses_to_compare=other_airlines_corpus,\n",
" return_topics=False,\n",
" ignore_words=ignore_words,\n",
" min_freq=2,\n",
" min_word_len=4,\n",
" sample_size=1,\n",
" prompt_remove_words=False,\n",
" use_idf=True, \n",
" smooth_idf=True,\n",
")"
]
},
@@ -816,20 +798,18 @@
},
"outputs": [],
"source": [
"# Commented out to avoid long run times\n",
"# figure = graph_topic_num_evals(\n",
"# method=['lda', 'bert', 'lda_bert'],\n",
"# text_corpus=text_corpus, \n",
"# input_language=input_language,\n",
"# num_keywords=num_keywords,\n",
"# topic_nums_to_compare=topic_nums_to_compare,\n",
"# sample_size=1,\n",
"# metrics=True, # stability and coherence\n",
"# save_file=False, # True for pwd or directory name\n",
"# return_ideal_metrics=False, # don't output ideal model instead of plot\n",
"# verbose=False, # so progress bar isn't broken online\n",
"# )\n",
"# plt.show()"
"figure = graph_topic_num_evals(\n",
" method=['lda', 'bert', 'lda_bert'],\n",
" text_corpus=text_corpus, \n",
" input_language=input_language,\n",
" num_keywords=num_keywords,\n",
" topic_nums_to_compare=topic_nums_to_compare,\n",
" metrics=True, # stability and coherence\n",
" save_file=False, # True for pwd or directory name\n",
" verbose=False, # so progress bar isn't broken online\n",
")\n",
"\n",
"plt.show()"
]
},
{
@@ -850,15 +830,13 @@
},
"outputs": [],
"source": [
"# Commented out as it changes the output dimensions due to its width\n",
"# pyLDAvis_topics(\n",
"# method='lda',\n",
"# text_corpus=text_corpus, \n",
"# input_language=input_language,\n",
"# num_topics=num_topics,\n",
"# save_file=False, # True for pwd or directory name\n",
"# display_ipython=True, # <- show in Jupyter notebook\n",
"# )"
"pyLDAvis_topics(\n",
" method='lda',\n",
" text_corpus=text_corpus, \n",
" num_topics=num_topics,\n",
" save_file=False, # True for pwd or directory name\n",
" display_ipython=True, # <- show in Jupyter notebook\n",
")"
]
},
{
@@ -916,7 +894,6 @@
"source": [
"gen_word_cloud(\n",
" text_corpus=text_corpus,\n",
" input_language=input_language,\n",
" ignore_words=ignore_words,\n",
" height=500,\n",
" save_file=False, # True for pwd or directory name\n",
@@ -999,24 +976,20 @@
},
"outputs": [],
"source": [
"# Commented out to avoid long run times\n",
"# gen_files(\n",
"# method=['lda', 'bert', 'lda_bert'],\n",
"# text_corpus=text_corpus, \n",
"# input_language=input_language,\n",
"# output_language=None,\n",
"# num_keywords=num_keywords,\n",
"# topic_nums_to_compare=topic_nums_to_compare,\n",
"# ignore_words=ignore_words,\n",
"# min_freq=2,\n",
"# min_word_len=4,\n",
"# sample_size=1,\n",
"# prompt_remove_words=True,\n",
"# verbose=False, # so progress bar isn't broken online\n",
"# org_by_pos=False, # organize keywords by part of speech\n",
"# incl_visuals=['topic_num_evals', 'word_cloud', 'pyLDAvis'], # t_sne not zipping properly\n",
"# zip_results=True,\n",
"# )"
"gen_files(\n",
" method=['lda', 'bert', 'lda_bert'],\n",
" text_corpus=text_corpus, \n",
" input_language=input_language,\n",
" output_language=None,\n",
" num_keywords=num_keywords,\n",
" topic_nums_to_compare=topic_nums_to_compare,\n",
" ignore_words=ignore_words,\n",
" prompt_remove_words=True,\n",
" verbose=True,\n",
" org_by_pos=False, # organize keywords by part of speech\n",
" incl_visuals=['topic_num_evals', 'word_cloud', 'pyLDAvis'], # t_sne not zipping properly\n",
" zip_results=True,\n",
")"
]
},
{
@@ -1043,7 +1016,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.7.9"
},
"toc": {
"base_numbering": 1,
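As the comments added in the cells above note, extract_kws forwards extra keyword arguments to the method's underlying model (gensim's LdaMulticore, SentenceTransformer.encode, sklearn's TfidfVectorizer). A short sketch of that pass-through, reusing the corpora built earlier in the notebook (text_corpus, united_corpus, other_airlines_corpus); the kwarg values are the illustrative ones from this diff:

```python
from kwx.model import extract_kws

lda_kws = extract_kws(
    method="LDA",
    text_corpus=text_corpus,
    input_language="english",
    num_keywords=15,
    num_topics=10,
    passes=20,  # forwarded to gensim.models.ldamulticore.LdaMulticore
    eval_every=5,
    decay=0.5,
)

tfidf_kws = extract_kws(
    method="tfidf",
    text_corpus=united_corpus,
    corpuses_to_compare=other_airlines_corpus,  # keywords prevalent vs. these
    input_language="english",
    num_keywords=10,
    num_topics=10,
    use_idf=True,  # forwarded to sklearn's TfidfVectorizer
    smooth_idf=True,
)
```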
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup_args = dict(
name="kwx",
version="0.1.4",
version="0.1.5",
author="Andrew Tavis McAllister",
author_email="[email protected]",
classifiers=[
