From b07786a2cbac31e5dbab744a31965dc8dfef8cee Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 19 Dec 2017 14:09:42 -0500 Subject: [PATCH 01/16] modifying lda class structure --- quantgov/estimator/candidate_sets.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/quantgov/estimator/candidate_sets.py b/quantgov/estimator/candidate_sets.py index 97978fa..84e0198 100644 --- a/quantgov/estimator/candidate_sets.py +++ b/quantgov/estimator/candidate_sets.py @@ -17,9 +17,24 @@ import sklearn.multioutput import sklearn.pipeline import sklearn.feature_extraction +from decorator import decorator +from . import structures + +try: + import gensim +except ImportError: + gensim = None import quantgov.estimator + +@decorator +def check_gensim(func, *args, **kwargs): + if gensim is None: + raise RuntimeError('Must install gensim to use {}'.format(func)) + return func(*args, **kwargs) + + classification = [ quantgov.estimator.CandidateModel( name="Random Forests", @@ -69,3 +84,16 @@ } ), ] + +topic_modeling = [ + quantgov.estimator.CandidateModel( + name="LDA", + model=sklearn.pipeline.Pipeline(steps=( + ('corpus creation', structures.TopicPreprocessor()), + ('lda', gensim.sklearn_api.ldamode.LdaTransformer( + # id2word=dictionary, + passes=1 + )), + )), + ), +] From 257131d329e062ed4c9dce164a34f78eec5f1de0 Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 19 Dec 2017 14:13:21 -0500 Subject: [PATCH 02/16] adjusting lda model, adding parameters for grid search --- quantgov/estimator/candidate_sets.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/quantgov/estimator/candidate_sets.py b/quantgov/estimator/candidate_sets.py index 84e0198..e59b4a2 100644 --- a/quantgov/estimator/candidate_sets.py +++ b/quantgov/estimator/candidate_sets.py @@ -88,12 +88,11 @@ def check_gensim(func, *args, **kwargs): topic_modeling = [ quantgov.estimator.CandidateModel( name="LDA", - model=sklearn.pipeline.Pipeline(steps=( - ('corpus creation', structures.TopicPreprocessor()), - ('lda', gensim.sklearn_api.ldamode.LdaTransformer( - # id2word=dictionary, - passes=1 - )), - )), + model=structures.QGLdaModel(), + parameters={ + 'eta': [0.1, 0.05, 0.01], + 'passes': [1, 2, 3], + 'num_topics': [10, 50, 100] + } ), ] From 265cc83b06c4c3f37510cecf6a614364b222a80d Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 19 Dec 2017 14:13:42 -0500 Subject: [PATCH 03/16] adjusting lda model, adding parameters for grid search --- quantgov/estimator/structures.py | 62 ++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py index 8ef59ea..9dd89db 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/estimator/structures.py @@ -5,6 +5,33 @@ """ import collections import joblib as jl +from sklearn.base import BaseEstimator, TransformerMixin +from six import iteritems +from decorator import decorator +import re + +try: + from spacy.lang.en.stop_words import STOP_WORDS + from gensim.corpora import Dictionary + import gensim + spacy = True +except ImportError: + spacy = None + gensim = None + + +@decorator +def check_spacy(func, *args, **kwargs): + if spacy is None: + raise RuntimeError('Must install spacy to use {}'.format(func)) + return func(*args, **kwargs) + + +@decorator +def check_gensim(func, *args, **kwargs): + if gensim is None: + raise RuntimeError('Must install gensim to use {}'.format(func)) + return func(*args, **kwargs) class _PersistanceMixin(object): @@ -85,3 +112,38 @@ class CandidateModel( parameter values to test as values """ pass + + +class QGLdaModel(BaseEstimator, TransformerMixin): + def __init__(self, word_regex=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS): + self.stop_words = stop_words + self.word_regex = re.compile(word_regex) + + def transform(self, driver): + return self.model.transform(driver.stream) + + def create_corpus(self, driver): + return [self.dictionary.doc2bow([i.group(0) + for i in self.word_regex.finditer(doc.text)]) + for doc in driver.stream()] + + @check_gensim + @check_spacy + def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None): + self.dictionary = Dictionary([[i.group(0) + for i in self.word_regex + .finditer(doc.text)] + for doc in driver.stream()]) + stop_ids = [self.dictionary.token2id[stopword] for stopword + in self.stop_words if stopword in self.dictionary.token2id] + once_ids = [tokenid for tokenid, docfreq in + iteritems(self.dictionary.dfs) if docfreq == 1] + self.dictionary.filter_tokens(stop_ids + once_ids) + self.corpus = self.create_corpus(driver) + self.model = gensim.models.ldamodel.LdaModel(self.corpus, + id2word=self.dictionary, + alpha=alpha + eta=eta, + num_topics=num_topics, + passes=passes) + return self From bb5f0369f91305dce8b0b588c848d950d6d7cfca Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 19 Dec 2017 14:14:31 -0500 Subject: [PATCH 04/16] adding topic modeling dependencies --- setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.py b/setup.py index 3d424b1..910fde8 100644 --- a/setup.py +++ b/setup.py @@ -65,6 +65,10 @@ def find_version(*file_paths): 'nlp': [ 'textblob', 'nltk', + ], + 'topic_modeling': [ + 'gensim', + 'spacy' ] }, entry_points={ From 2a15b324b20475de69c80e82064e2826c8096fc0 Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 19 Dec 2017 14:19:49 -0500 Subject: [PATCH 05/16] fixing syntax --- quantgov/estimator/structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py index 9dd89db..199d9d0 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/estimator/structures.py @@ -142,7 +142,7 @@ def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None): self.corpus = self.create_corpus(driver) self.model = gensim.models.ldamodel.LdaModel(self.corpus, id2word=self.dictionary, - alpha=alpha + alpha=alpha, eta=eta, num_topics=num_topics, passes=passes) From c3faebfff500fc50d8ecf1b1c0bf569076d03c91 Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 19 Dec 2017 16:32:44 -0500 Subject: [PATCH 06/16] lowercasing words --- quantgov/estimator/structures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py index 199d9d0..816b6e5 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/estimator/structures.py @@ -123,14 +123,14 @@ def transform(self, driver): return self.model.transform(driver.stream) def create_corpus(self, driver): - return [self.dictionary.doc2bow([i.group(0) + return [self.dictionary.doc2bow([i.group(0).lower() for i in self.word_regex.finditer(doc.text)]) for doc in driver.stream()] @check_gensim @check_spacy def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None): - self.dictionary = Dictionary([[i.group(0) + self.dictionary = Dictionary([[i.group(0).lower() for i in self.word_regex .finditer(doc.text)] for doc in driver.stream()]) From c6b764d9809ba4d4102ddafa50889e1188c7ba8a Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 2 Jan 2018 12:35:35 -0500 Subject: [PATCH 07/16] adding sklearn api version --- quantgov/estimator/structures.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py index 816b6e5..9aad0f3 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/estimator/structures.py @@ -13,6 +13,7 @@ try: from spacy.lang.en.stop_words import STOP_WORDS from gensim.corpora import Dictionary + from gensim import sklearn_api import gensim spacy = True except ImportError: @@ -115,20 +116,21 @@ class CandidateModel( class QGLdaModel(BaseEstimator, TransformerMixin): + @check_gensim + @check_spacy def __init__(self, word_regex=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS): self.stop_words = stop_words self.word_regex = re.compile(word_regex) def transform(self, driver): - return self.model.transform(driver.stream) + self.test_corpus = self.create_corpus(driver) + return self.model.transform(self.test_corpus) def create_corpus(self, driver): return [self.dictionary.doc2bow([i.group(0).lower() for i in self.word_regex.finditer(doc.text)]) for doc in driver.stream()] - @check_gensim - @check_spacy def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None): self.dictionary = Dictionary([[i.group(0).lower() for i in self.word_regex @@ -140,10 +142,12 @@ def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None): iteritems(self.dictionary.dfs) if docfreq == 1] self.dictionary.filter_tokens(stop_ids + once_ids) self.corpus = self.create_corpus(driver) - self.model = gensim.models.ldamodel.LdaModel(self.corpus, - id2word=self.dictionary, - alpha=alpha, - eta=eta, - num_topics=num_topics, - passes=passes) + self.model = sklearn_api.ldamodel.LdaTransformer( + alpha=alpha, + eta=eta, + num_topics=num_topics, + passes=passes, + id2word=self.dictionary + ) + self.model.fit(self.corpus) return self From a17ed349c7f8bdd67caa88b1301caf46ed7c83cb Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 2 Jan 2018 12:36:25 -0500 Subject: [PATCH 08/16] initial commit testing topic model --- tests/test_estimators.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 tests/test_estimators.py diff --git a/tests/test_estimators.py b/tests/test_estimators.py new file mode 100644 index 0000000..5b099aa --- /dev/null +++ b/tests/test_estimators.py @@ -0,0 +1,27 @@ +# import pytest +import subprocess +import quantgov.estimator +import quantgov + +from pathlib import Path + +PSEUDO_CORPUS_PATH = Path(__file__).resolve().parent.joinpath('pseudo_corpus') +driver = quantgov.load_driver(PSEUDO_CORPUS_PATH) +# models = quantgov.estimator.utils.load_models('./sample_models.py') + + +# def test_all_model_evaluation(): +# quantgov.estimator.evaluation.evaluate_all_models(models, X, y, 2, 'f1') + + +def test_topic_model(): + sample = quantgov.estimator.structures.QGLdaModel() + sample.fit(driver) + sample.transform(driver) + + +def check_output(cmd): + return ( + subprocess.check_output(cmd, universal_newlines=True) + .replace('\n\n', '\n') + ) From d5592f0d829801bf302c3652ad8c38c6354316c3 Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 2 Jan 2018 12:37:14 -0500 Subject: [PATCH 09/16] loading all models for testing --- tests/sample_models.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 tests/sample_models.py diff --git a/tests/sample_models.py b/tests/sample_models.py new file mode 100644 index 0000000..38be9a5 --- /dev/null +++ b/tests/sample_models.py @@ -0,0 +1,98 @@ +""" +quantgov.estimator.candidate_sets: Starter model candidate sets + + +This module provides a few sample sets of models for common problems. These are +mostly helpful for initial analysis; in general, you will want to customize +these. + +The currently included candidates sets are: + * `classificaiton`: Random Forests and Logit with TF-IDF preprocessor + * `multilabel_classificaiton`: same as classification, with the Logit + classifier wrapped in a MultiOutputClassifier +""" +import numpy as np +import sklearn.ensemble +import sklearn.linear_model +import sklearn.multioutput +import sklearn.pipeline +import sklearn.feature_extraction +from decorator import decorator +# from . import structures + +try: + import gensim +except ImportError: + gensim = None + +import quantgov.estimator + + +# @decorator +# def check_gensim(func, *args, **kwargs): +# if gensim is None: +# raise RuntimeError('Must install gensim to use {}'.format(func)) +# return func(*args, **kwargs) + + +models = [ + quantgov.estimator.CandidateModel( + name="Random Forests", + model=sklearn.pipeline.Pipeline(steps=( + ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), + ('rf', sklearn.ensemble.RandomForestClassifier(n_jobs=-1)), + )), + parameters={ + 'rf__n_estimators': [5, 10, 25, 50, 100], + } + ), + quantgov.estimator.CandidateModel( + name="Logistic Regression", + model=sklearn.pipeline.Pipeline(steps=( + ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), + ('logit', sklearn.linear_model.LogisticRegression()), + )), + parameters={ + 'logit__C': np.logspace(-2, 2, 5) + } + ), +] + + +multilabel_classification = [ + quantgov.estimator.CandidateModel( + name="Random Forests", + model=sklearn.pipeline.Pipeline(steps=( + ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), + ('rf', sklearn.ensemble.RandomForestClassifier(n_jobs=-1)), + )), + parameters={ + 'rf__n_estimators': [5, 10, 25, 50, 100], + } + ), + quantgov.estimator.CandidateModel( + name="Logistic Regression", + model=sklearn.pipeline.Pipeline(steps=( + ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), + ('logit', sklearn.multioutput.MultiOutputClassifier( + sklearn.linear_model.LogisticRegression(), + n_jobs=-1 + )), + )), + parameters={ + 'logit__estimator__C': np.logspace(-2, 2, 5) + } + ), +] + +topic_modeling = [ + quantgov.estimator.CandidateModel( + name="LDA", + model=quantgov.estimator.structures.QGLdaModel(), + parameters={ + 'eta': [0.1, 0.05, 0.01], + 'passes': [1, 2, 3], + 'num_topics': [10, 50, 100] + } + ), +] From 58513f7cd00e074d03e3b9f8acb494e70d188b8f Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 2 Jan 2018 13:03:18 -0500 Subject: [PATCH 10/16] passing topic model test --- quantgov/estimator/structures.py | 2 +- tests/sample_models.py | 98 -------------------------------- tests/test_estimators.py | 7 +-- 3 files changed, 2 insertions(+), 105 deletions(-) delete mode 100644 tests/sample_models.py diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py index 9aad0f3..47fa7e9 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/estimator/structures.py @@ -131,7 +131,7 @@ def create_corpus(self, driver): for i in self.word_regex.finditer(doc.text)]) for doc in driver.stream()] - def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None): + def fit(self, driver, alpha=None, eta=None, num_topics=1, passes=1): self.dictionary = Dictionary([[i.group(0).lower() for i in self.word_regex .finditer(doc.text)] diff --git a/tests/sample_models.py b/tests/sample_models.py deleted file mode 100644 index 38be9a5..0000000 --- a/tests/sample_models.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -quantgov.estimator.candidate_sets: Starter model candidate sets - - -This module provides a few sample sets of models for common problems. These are -mostly helpful for initial analysis; in general, you will want to customize -these. - -The currently included candidates sets are: - * `classificaiton`: Random Forests and Logit with TF-IDF preprocessor - * `multilabel_classificaiton`: same as classification, with the Logit - classifier wrapped in a MultiOutputClassifier -""" -import numpy as np -import sklearn.ensemble -import sklearn.linear_model -import sklearn.multioutput -import sklearn.pipeline -import sklearn.feature_extraction -from decorator import decorator -# from . import structures - -try: - import gensim -except ImportError: - gensim = None - -import quantgov.estimator - - -# @decorator -# def check_gensim(func, *args, **kwargs): -# if gensim is None: -# raise RuntimeError('Must install gensim to use {}'.format(func)) -# return func(*args, **kwargs) - - -models = [ - quantgov.estimator.CandidateModel( - name="Random Forests", - model=sklearn.pipeline.Pipeline(steps=( - ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), - ('rf', sklearn.ensemble.RandomForestClassifier(n_jobs=-1)), - )), - parameters={ - 'rf__n_estimators': [5, 10, 25, 50, 100], - } - ), - quantgov.estimator.CandidateModel( - name="Logistic Regression", - model=sklearn.pipeline.Pipeline(steps=( - ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), - ('logit', sklearn.linear_model.LogisticRegression()), - )), - parameters={ - 'logit__C': np.logspace(-2, 2, 5) - } - ), -] - - -multilabel_classification = [ - quantgov.estimator.CandidateModel( - name="Random Forests", - model=sklearn.pipeline.Pipeline(steps=( - ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), - ('rf', sklearn.ensemble.RandomForestClassifier(n_jobs=-1)), - )), - parameters={ - 'rf__n_estimators': [5, 10, 25, 50, 100], - } - ), - quantgov.estimator.CandidateModel( - name="Logistic Regression", - model=sklearn.pipeline.Pipeline(steps=( - ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()), - ('logit', sklearn.multioutput.MultiOutputClassifier( - sklearn.linear_model.LogisticRegression(), - n_jobs=-1 - )), - )), - parameters={ - 'logit__estimator__C': np.logspace(-2, 2, 5) - } - ), -] - -topic_modeling = [ - quantgov.estimator.CandidateModel( - name="LDA", - model=quantgov.estimator.structures.QGLdaModel(), - parameters={ - 'eta': [0.1, 0.05, 0.01], - 'passes': [1, 2, 3], - 'num_topics': [10, 50, 100] - } - ), -] diff --git a/tests/test_estimators.py b/tests/test_estimators.py index 5b099aa..94af230 100644 --- a/tests/test_estimators.py +++ b/tests/test_estimators.py @@ -7,16 +7,11 @@ PSEUDO_CORPUS_PATH = Path(__file__).resolve().parent.joinpath('pseudo_corpus') driver = quantgov.load_driver(PSEUDO_CORPUS_PATH) -# models = quantgov.estimator.utils.load_models('./sample_models.py') - - -# def test_all_model_evaluation(): -# quantgov.estimator.evaluation.evaluate_all_models(models, X, y, 2, 'f1') def test_topic_model(): sample = quantgov.estimator.structures.QGLdaModel() - sample.fit(driver) + sample.fit(driver, num_topics=2) sample.transform(driver) From aac7d98b0f349c1e47a756719e0c74d20aa3064e Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 2 Jan 2018 13:08:46 -0500 Subject: [PATCH 11/16] updating travis config dependencies --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 75cc0ab..1833fc7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: install: - pip install ".[testing]" - pip install ".[nlp]" +- pip install ".[topic_modeling]" - python -m nltk.downloader punkt stopwords wordnet script: pytest deploy: From ef9853f0f6d1638d6c50c73aea5bc9d3d3864e1b Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Tue, 2 Jan 2018 13:18:42 -0500 Subject: [PATCH 12/16] checking import before initializing --- quantgov/estimator/candidate_sets.py | 37 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/quantgov/estimator/candidate_sets.py b/quantgov/estimator/candidate_sets.py index e59b4a2..7e88942 100644 --- a/quantgov/estimator/candidate_sets.py +++ b/quantgov/estimator/candidate_sets.py @@ -17,24 +17,22 @@ import sklearn.multioutput import sklearn.pipeline import sklearn.feature_extraction -from decorator import decorator from . import structures try: import gensim except ImportError: gensim = None +try: + import gensim + import spacy +except ImportError: + spacy = None + gensim = None import quantgov.estimator -@decorator -def check_gensim(func, *args, **kwargs): - if gensim is None: - raise RuntimeError('Must install gensim to use {}'.format(func)) - return func(*args, **kwargs) - - classification = [ quantgov.estimator.CandidateModel( name="Random Forests", @@ -85,14 +83,15 @@ def check_gensim(func, *args, **kwargs): ), ] -topic_modeling = [ - quantgov.estimator.CandidateModel( - name="LDA", - model=structures.QGLdaModel(), - parameters={ - 'eta': [0.1, 0.05, 0.01], - 'passes': [1, 2, 3], - 'num_topics': [10, 50, 100] - } - ), -] +if gensim and spacy: + topic_modeling = [ + quantgov.estimator.CandidateModel( + name="LDA", + model=structures.QGLdaModel(), + parameters={ + 'eta': [0.1, 0.05, 0.01], + 'passes': [1, 2, 3], + 'num_topics': [10, 50, 100] + } + ), + ] From c97c6dfc63fe88ecb8857b211b8783603ae43648 Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Thu, 18 Jan 2018 10:25:35 -0500 Subject: [PATCH 13/16] adjusting dependencies, variable names --- quantgov/estimator/candidate_sets.py | 2 +- quantgov/estimator/structures.py | 21 +++++++-------------- tests/test_estimators.py | 2 +- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/quantgov/estimator/candidate_sets.py b/quantgov/estimator/candidate_sets.py index 7e88942..38e6813 100644 --- a/quantgov/estimator/candidate_sets.py +++ b/quantgov/estimator/candidate_sets.py @@ -87,7 +87,7 @@ topic_modeling = [ quantgov.estimator.CandidateModel( name="LDA", - model=structures.QGLdaModel(), + model=structures.GensimLda(), parameters={ 'eta': [0.1, 0.05, 0.01], 'passes': [1, 2, 3], diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py index 47fa7e9..000e48f 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/estimator/structures.py @@ -11,21 +11,15 @@ import re try: - from spacy.lang.en.stop_words import STOP_WORDS from gensim.corpora import Dictionary from gensim import sklearn_api import gensim - spacy = True except ImportError: - spacy = None gensim = None -@decorator -def check_spacy(func, *args, **kwargs): - if spacy is None: - raise RuntimeError('Must install spacy to use {}'.format(func)) - return func(*args, **kwargs) +from sklearn.feature_extraction import stop_words +STOP_WORDS = stop_words.ENGLISH_STOP_WORDS @decorator @@ -115,12 +109,11 @@ class CandidateModel( pass -class QGLdaModel(BaseEstimator, TransformerMixin): +class GensimLda(BaseEstimator, TransformerMixin): @check_gensim - @check_spacy - def __init__(self, word_regex=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS): + def __init__(self, word_pattern=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS): self.stop_words = stop_words - self.word_regex = re.compile(word_regex) + self.word_pattern = re.compile(word_pattern) def transform(self, driver): self.test_corpus = self.create_corpus(driver) @@ -128,12 +121,12 @@ def transform(self, driver): def create_corpus(self, driver): return [self.dictionary.doc2bow([i.group(0).lower() - for i in self.word_regex.finditer(doc.text)]) + for i in self.word_pattern.finditer(doc.text)]) for doc in driver.stream()] def fit(self, driver, alpha=None, eta=None, num_topics=1, passes=1): self.dictionary = Dictionary([[i.group(0).lower() - for i in self.word_regex + for i in self.word_pattern .finditer(doc.text)] for doc in driver.stream()]) stop_ids = [self.dictionary.token2id[stopword] for stopword diff --git a/tests/test_estimators.py b/tests/test_estimators.py index 94af230..2b52c5d 100644 --- a/tests/test_estimators.py +++ b/tests/test_estimators.py @@ -10,7 +10,7 @@ def test_topic_model(): - sample = quantgov.estimator.structures.QGLdaModel() + sample = quantgov.estimator.structures.GensimLda() sample.fit(driver, num_topics=2) sample.transform(driver) From aaffd9419caa0d364573bd1d38761984eeccd8fe Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Thu, 18 Jan 2018 10:31:16 -0500 Subject: [PATCH 14/16] moving stopword removal --- quantgov/estimator/structures.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py index 000e48f..992a3ed 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/estimator/structures.py @@ -127,13 +127,12 @@ def create_corpus(self, driver): def fit(self, driver, alpha=None, eta=None, num_topics=1, passes=1): self.dictionary = Dictionary([[i.group(0).lower() for i in self.word_pattern - .finditer(doc.text)] + .finditer(doc.text) + if i not in self.stop_words] for doc in driver.stream()]) - stop_ids = [self.dictionary.token2id[stopword] for stopword - in self.stop_words if stopword in self.dictionary.token2id] once_ids = [tokenid for tokenid, docfreq in iteritems(self.dictionary.dfs) if docfreq == 1] - self.dictionary.filter_tokens(stop_ids + once_ids) + self.dictionary.filter_tokens(once_ids) self.corpus = self.create_corpus(driver) self.model = sklearn_api.ldamodel.LdaTransformer( alpha=alpha, From 58db43ab23c01b0840a379433b4d5ba7724fd7df Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Thu, 18 Jan 2018 10:46:11 -0500 Subject: [PATCH 15/16] restructuring stopword args, adjusting min word freq --- quantgov/estimator/structures.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py index 992a3ed..1f32bc6 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/estimator/structures.py @@ -111,8 +111,14 @@ class CandidateModel( class GensimLda(BaseEstimator, TransformerMixin): @check_gensim - def __init__(self, word_pattern=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS): - self.stop_words = stop_words + def __init__(self, word_pattern=r'\b[A-z]{2,}\b', stop_words='en'): + if stop_words == 'en': + self.stop_words = STOP_WORDS + elif not stop_words: + self.stop_words = None + else: + self.stop_words = stop_words + self.word_pattern = re.compile(word_pattern) def transform(self, driver): @@ -124,15 +130,17 @@ def create_corpus(self, driver): for i in self.word_pattern.finditer(doc.text)]) for doc in driver.stream()] - def fit(self, driver, alpha=None, eta=None, num_topics=1, passes=1): + def fit(self, driver, alpha=None, eta=None, num_topics=1, + passes=1, min_wf=1): self.dictionary = Dictionary([[i.group(0).lower() for i in self.word_pattern - .finditer(doc.text) - if i not in self.stop_words] + .finditer(doc.text)] for doc in driver.stream()]) + stop_ids = [self.dictionary.token2id[stopword] for stopword + in self.stop_words if stopword in self.dictionary.token2id] once_ids = [tokenid for tokenid, docfreq in - iteritems(self.dictionary.dfs) if docfreq == 1] - self.dictionary.filter_tokens(once_ids) + iteritems(self.dictionary.dfs) if docfreq <= min_wf] + self.dictionary.filter_tokens(stop_ids + once_ids) self.corpus = self.create_corpus(driver) self.model = sklearn_api.ldamodel.LdaTransformer( alpha=alpha, From 30af4c2acb9d22752a382e63e857489d30f0b24d Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Thu, 1 Mar 2018 14:45:30 -0500 Subject: [PATCH 16/16] providing wrapper for show_topics --- quantgov/estimator/structures.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py index 1f32bc6..d88d570 100644 --- a/quantgov/estimator/structures.py +++ b/quantgov/estimator/structures.py @@ -130,6 +130,9 @@ def create_corpus(self, driver): for i in self.word_pattern.finditer(doc.text)]) for doc in driver.stream()] + def show_topics(self): + return self.model.gensim_model.show_topics() + def fit(self, driver, alpha=None, eta=None, num_topics=1, passes=1, min_wf=1): self.dictionary = Dictionary([[i.group(0).lower()