Skip to content

Commit

Permalink
Merge pull request #11 from tgunda/resolve-gensim-update
Browse files: browse the repository at this point in the history
Resolve gensim update
  • Loading branch information
MichaelHopwood authored Apr 26, 2021
2 parents ad37bea + 7537963 commit 27babef
Show file tree
Hide file tree
Showing 6 changed files with 1,982 additions and 274 deletions.
11 changes: 8 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,17 @@
import sys

import mock
MODULES = ['numpy', 'nltk', 'sklearn.pipeline', 'sklearn.model_selection', 'scipy.sparse', 'pandas', 'scipy', 'sklearn.base', 'gensim.models.doc2vec', 'nltk.tokenize', 'datefinder', 'text_remove_nondate_nums', 'text_remove_numbers_stopwords', 'get_dates', 'gensim.models', 'sklearn.svm', 'sklearn.tree', 'sklearn.neural_network', 'sklearn.linear_model', 'sklearn.ensemble', "sklearn.cluster", "networkx", "matplotlib", "matplotlib.pyplot", "gensim.models.doc2vec", "sklearn.feature_extraction.text", "nltk.tokenize", "plotly.graph_objects", "scipy.signal", 'matplotlib.colors', 'seaborn', 'matplotlib.ticker', 'scipy.signal.find_peaks']
MODULES = ['numpy', 'nltk', 'sklearn.pipeline', 'sklearn.model_selection',
'scipy.sparse', 'pandas', 'scipy', 'sklearn.base', 'gensim.models.doc2vec',
'nltk.tokenize', 'datefinder', 'text_remove_nondate_nums', 'text_remove_numbers_stopwords',
'get_dates', 'gensim.models', 'sklearn.svm', 'sklearn.tree', 'sklearn.neural_network',
'sklearn.linear_model', 'sklearn.ensemble', "sklearn.cluster", "networkx", "matplotlib",
"matplotlib.pyplot", "gensim.models.doc2vec", "sklearn.feature_extraction.text", "nltk.tokenize",
"plotly.graph_objects", "scipy.signal", 'matplotlib.colors', 'seaborn', 'matplotlib.ticker',
'scipy.signal.find_peaks']
for module in MODULES:
sys.modules[module] = mock.Mock()

import shlex

sys.path.insert(0, os.path.abspath("../pvops"))
sys.path.insert(0, os.path.abspath("../pvops/text2time"))
sys.path.insert(0, os.path.abspath("../pvops/text"))
Expand Down
39 changes: 10 additions & 29 deletions examples/text_class_example.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,27 @@
import numpy as np
import pandas as pd
import pickle
import traceback
import nltk

import sys
import os

sys.path.append("..")
pvops_directory = os.path.join("..", "pvops")
sys.path.append(pvops_directory)

# Utilities
# pvOps subpackages
from pvops.text import nlp_utils
from pvops.text import utils

# Visualizations
from pvops.text import visualize

# Preprocessing
from pvops.text import preprocess

# Classification
from pvops.text import classify

# Library example definitions
from pvops.text import defaults

# Embedding
import nltk
import traceback
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec

# Clustering
from sklearn.cluster import KMeans

# Scoring
from sklearn.metrics import make_scorer, f1_score, homogeneity_score


class Example:
def __init__(self, df, LABEL_COLUMN):
self.LABEL_COLUMN = LABEL_COLUMN
Expand Down Expand Up @@ -75,7 +59,7 @@ def extract_dates(
EVENTSTART_COLUMN,
SAVE_DATA_COLUMN="CleanDesc",
SAVE_DATE_COLUMN="ExtractedDates",
print_info = False,
print_info=False,
):

col_dict = {
Expand Down Expand Up @@ -505,7 +489,7 @@ def predict_best_model(
output_col = f"Unsupervised_Pred_{self.LABEL_COLUMN}"

self.df[output_col] = pred_y
print(f"Predictions stored to {output_col} in `df` attribute")
print(f"Predictions stored to {output_col} in `df`")

print(f"Score: {score}")

Expand All @@ -519,9 +503,6 @@ def predict_best_model(
df = pd.read_csv(folder + filename)

e = Example(df, LABEL_COLUMN)
# df = e.extract_dates(DATA_COLUMN, DATE_COLUMN, SAVE_DATE_COLUMN='ExtractedDates')
e.prep_data_for_ML(DATA_COLUMN, DATE_COLUMN)
# e.test_doc2vec()
# Use only a few cross-validation splits because the example dataset is small
e.classify_supervised(n_cv_splits=2, embedding="doc2vec")
e.predict_best_model()
e.summarize_text_data(DATA_COLUMN)

print("\nMessage from pvOps team: See `tutorial_textmodule.ipynb` for a more in-depth demonstration of the text module's functionality.")
2,197 changes: 1,959 additions & 238 deletions examples/tutorial_textmodule.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pvops/text/nlp_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def __init__(
self.dv_mapfile = dv_mapfile
self.comment = comment
self.trim_rule = trim_rule
self.callbacks = callbacks
self.window = window
self.epochs = epochs

Expand Down Expand Up @@ -70,7 +71,7 @@ def fit(self, raw_documents, y=None):
self.d2v_model.train(
tagged_documents,
total_examples=len(tagged_documents),
epochs=self.d2v_model.iter,
epochs=self.d2v_model.epochs,
)
return self

Expand Down
4 changes: 2 additions & 2 deletions pvops/text/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def summarize_text_data(om_df, colname):
tokenized = [sentence.split() for sentence in nonan_text]
avg_n_words = np.array([len(tokens) for tokens in tokenized]).mean()
sum_n_words = np.array([len(tokens) for tokens in tokenized]).sum()
model = Word2Vec(tokenized, min_count=1, size=64)
model = Word2Vec(tokenized, min_count=1)

# Total vocabulary
vocab = model.wv.vocab
vocab = model.wv

# Bold title.
print("\033[1m" + "DETAILS" + "\033[0m")
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ datefinder
matplotlib
seaborn
plotly
gensim
gensim>=4.0.0
networkx

0 comments on commit 27babef

Please sign in to comment.