Merge pull request #11 from tgunda/resolve-gensim-update

Resolve gensim update
sandialabs · Apr 26, 2021 · 27babef
2 parents ad37bea + 7537963
commit 27babef
Show file tree

Hide file tree

Showing 6 changed files with 1,982 additions and 274 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -14,12 +14,17 @@
 import sys
 
 import mock
-MODULES = ['numpy', 'nltk', 'sklearn.pipeline', 'sklearn.model_selection', 'scipy.sparse', 'pandas',  'scipy', 'sklearn.base', 'gensim.models.doc2vec', 'nltk.tokenize', 'datefinder', 'text_remove_nondate_nums', 'text_remove_numbers_stopwords', 'get_dates', 'gensim.models', 'sklearn.svm', 'sklearn.tree', 'sklearn.neural_network', 'sklearn.linear_model', 'sklearn.ensemble', "sklearn.cluster", "networkx", "matplotlib", "matplotlib.pyplot", "gensim.models.doc2vec", "sklearn.feature_extraction.text", "nltk.tokenize", "plotly.graph_objects", "scipy.signal", 'matplotlib.colors', 'seaborn', 'matplotlib.ticker', 'scipy.signal.find_peaks']
+MODULES = ['numpy', 'nltk', 'sklearn.pipeline', 'sklearn.model_selection', 
+'scipy.sparse', 'pandas',  'scipy', 'sklearn.base', 'gensim.models.doc2vec', 
+'nltk.tokenize', 'datefinder', 'text_remove_nondate_nums', 'text_remove_numbers_stopwords', 
+'get_dates', 'gensim.models', 'sklearn.svm', 'sklearn.tree', 'sklearn.neural_network', 
+'sklearn.linear_model', 'sklearn.ensemble', "sklearn.cluster", "networkx", "matplotlib",
+ "matplotlib.pyplot", "gensim.models.doc2vec", "sklearn.feature_extraction.text", "nltk.tokenize", 
+ "plotly.graph_objects", "scipy.signal", 'matplotlib.colors', 'seaborn', 'matplotlib.ticker', 
+ 'scipy.signal.find_peaks']
 for module in MODULES:
     sys.modules[module] = mock.Mock()
 
-import shlex
-
 sys.path.insert(0, os.path.abspath("../pvops"))
 sys.path.insert(0, os.path.abspath("../pvops/text2time"))
 sys.path.insert(0, os.path.abspath("../pvops/text"))

diff --git a/examples/text_class_example.py b/examples/text_class_example.py
@@ -1,43 +1,27 @@
-import numpy as np
-import pandas as pd
-import pickle
-import traceback
-import nltk
-
 import sys
 import os
 
 sys.path.append("..")
 pvops_directory = os.path.join("..", "pvops")
 sys.path.append(pvops_directory)
 
-# Utilities
+# pvOps subpackages
 from pvops.text import nlp_utils
 from pvops.text import utils
-
-# Visualizations
 from pvops.text import visualize
-
-# Preprocessing
 from pvops.text import preprocess
-
-# Classification
 from pvops.text import classify
-
-# Library example definitions
 from pvops.text import defaults
 
-# Embedding
+import nltk
+import traceback
+import pandas as pd
+import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from gensim.models.doc2vec import Doc2Vec
-
-# Clustering
 from sklearn.cluster import KMeans
-
-# Scoring
 from sklearn.metrics import make_scorer, f1_score, homogeneity_score
 
-
 class Example:
     def __init__(self, df, LABEL_COLUMN):
         self.LABEL_COLUMN = LABEL_COLUMN
@@ -75,7 +59,7 @@ def extract_dates(
         EVENTSTART_COLUMN,
         SAVE_DATA_COLUMN="CleanDesc",
         SAVE_DATE_COLUMN="ExtractedDates",
-        print_info = False,
+        print_info=False,
     ):
 
         col_dict = {
@@ -505,7 +489,7 @@ def predict_best_model(
                     output_col = f"Unsupervised_Pred_{self.LABEL_COLUMN}"
 
         self.df[output_col] = pred_y
-        print(f"Predictions stored to {output_col} in `df` attribute")
+        print(f"Predictions stored to {output_col} in `df`")
 
         print(f"Score: {score}")
 
@@ -519,9 +503,6 @@ def predict_best_model(
     df = pd.read_csv(folder + filename)
 
     e = Example(df, LABEL_COLUMN)
-    # df = e.extract_dates(DATA_COLUMN, DATE_COLUMN, SAVE_DATE_COLUMN='ExtractedDates')
-    e.prep_data_for_ML(DATA_COLUMN, DATE_COLUMN)
-    # e.test_doc2vec()
-    # Setting few cross validation splits because of few example data
-    e.classify_supervised(n_cv_splits=2, embedding="doc2vec")
-    e.predict_best_model()
+    e.summarize_text_data(DATA_COLUMN)
+
+    print("\nMessage from pvOps team: See `tutorial_textmodule.ipynb` for a more in-depth demonstration of the text module's functionality.")
diff --git a/examples/tutorial_textmodule.ipynb b/examples/tutorial_textmodule.ipynb
diff --git a/pvops/text/nlp_utils.py b/pvops/text/nlp_utils.py
@@ -39,6 +39,7 @@ def __init__(
         self.dv_mapfile = dv_mapfile
         self.comment = comment
         self.trim_rule = trim_rule
+        self.callbacks = callbacks
         self.window = window
         self.epochs = epochs
 
@@ -70,7 +71,7 @@ def fit(self, raw_documents, y=None):
         self.d2v_model.train(
             tagged_documents,
             total_examples=len(tagged_documents),
-            epochs=self.d2v_model.iter,
+            epochs=self.d2v_model.epochs,
         )
         return self
 

diff --git a/pvops/text/utils.py b/pvops/text/utils.py
@@ -31,10 +31,10 @@ def summarize_text_data(om_df, colname):
     tokenized = [sentence.split() for sentence in nonan_text]
     avg_n_words = np.array([len(tokens) for tokens in tokenized]).mean()
     sum_n_words = np.array([len(tokens) for tokens in tokenized]).sum()
-    model = Word2Vec(tokenized, min_count=1, size=64)
+    model = Word2Vec(tokenized, min_count=1)
 
     # Total vocabulary
-    vocab = model.wv.vocab
+    vocab = model.wv
 
     # Bold title.
     print("\033[1m" + "DETAILS" + "\033[0m")

diff --git a/requirements.txt b/requirements.txt
@@ -7,5 +7,5 @@ datefinder
 matplotlib
 seaborn
 plotly
-gensim
+gensim>=4.0.0
 networkx