From b07786a2cbac31e5dbab744a31965dc8dfef8cee Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 19 Dec 2017 14:09:42 -0500
Subject: [PATCH 01/16] modifying lda class structure

---
 quantgov/estimator/candidate_sets.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/quantgov/estimator/candidate_sets.py b/quantgov/estimator/candidate_sets.py
index 97978fa..84e0198 100644
--- a/quantgov/estimator/candidate_sets.py
+++ b/quantgov/estimator/candidate_sets.py
@@ -17,9 +17,24 @@
 import sklearn.multioutput
 import sklearn.pipeline
 import sklearn.feature_extraction
+from decorator import decorator
+from . import structures
+
+try:
+    import gensim
+except ImportError:
+    gensim = None
 
 import quantgov.estimator
 
+
+@decorator
+def check_gensim(func, *args, **kwargs):
+    if gensim is None:
+        raise RuntimeError('Must install gensim to use {}'.format(func))
+    return func(*args, **kwargs)
+
+
 classification = [
     quantgov.estimator.CandidateModel(
         name="Random Forests",
@@ -69,3 +84,16 @@
         }
     ),
 ]
+
+topic_modeling = [
+    quantgov.estimator.CandidateModel(
+        name="LDA",
+        model=sklearn.pipeline.Pipeline(steps=(
+            ('corpus creation', structures.TopicPreprocessor()),
+            ('lda', gensim.sklearn_api.ldamode.LdaTransformer(
+                # id2word=dictionary,
+                passes=1
+            )),
+        )),
+    ),
+]

From 257131d329e062ed4c9dce164a34f78eec5f1de0 Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 19 Dec 2017 14:13:21 -0500
Subject: [PATCH 02/16] adjusting lda model, adding parameters for grid search

---
 quantgov/estimator/candidate_sets.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/quantgov/estimator/candidate_sets.py b/quantgov/estimator/candidate_sets.py
index 84e0198..e59b4a2 100644
--- a/quantgov/estimator/candidate_sets.py
+++ b/quantgov/estimator/candidate_sets.py
@@ -88,12 +88,11 @@ def check_gensim(func, *args, **kwargs):
 topic_modeling = [
     quantgov.estimator.CandidateModel(
         name="LDA",
-        model=sklearn.pipeline.Pipeline(steps=(
-            ('corpus creation', structures.TopicPreprocessor()),
-            ('lda', gensim.sklearn_api.ldamode.LdaTransformer(
-                # id2word=dictionary,
-                passes=1
-            )),
-        )),
+        model=structures.QGLdaModel(),
+        parameters={
+            'eta': [0.1, 0.05, 0.01],
+            'passes': [1, 2, 3],
+            'num_topics': [10, 50, 100]
+        }
     ),
 ]

From 265cc83b06c4c3f37510cecf6a614364b222a80d Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 19 Dec 2017 14:13:42 -0500
Subject: [PATCH 03/16] adding QGLdaModel estimator and gensim/spacy import checks

---
 quantgov/estimator/structures.py | 62 ++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py
index 8ef59ea..9dd89db 100644
--- a/quantgov/estimator/structures.py
+++ b/quantgov/estimator/structures.py
@@ -5,6 +5,33 @@
 """
 import collections
 import joblib as jl
+from sklearn.base import BaseEstimator, TransformerMixin
+from six import iteritems
+from decorator import decorator
+import re
+
+try:
+    from spacy.lang.en.stop_words import STOP_WORDS
+    from gensim.corpora import Dictionary
+    import gensim
+    spacy = True
+except ImportError:
+    spacy = None
+    gensim = None
+
+
+@decorator
+def check_spacy(func, *args, **kwargs):
+    if spacy is None:
+        raise RuntimeError('Must install spacy to use {}'.format(func))
+    return func(*args, **kwargs)
+
+
+@decorator
+def check_gensim(func, *args, **kwargs):
+    if gensim is None:
+        raise RuntimeError('Must install gensim to use {}'.format(func))
+    return func(*args, **kwargs)
 
 
 class _PersistanceMixin(object):
@@ -85,3 +112,38 @@ class CandidateModel(
             parameter values to test as values
     """
     pass
+
+
+class QGLdaModel(BaseEstimator, TransformerMixin):
+    def __init__(self, word_regex=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS):
+        self.stop_words = stop_words
+        self.word_regex = re.compile(word_regex)
+
+    def transform(self, driver):
+        return self.model.transform(driver.stream)
+
+    def create_corpus(self, driver):
+        return [self.dictionary.doc2bow([i.group(0)
+                for i in self.word_regex.finditer(doc.text)])
+                for doc in driver.stream()]
+
+    @check_gensim
+    @check_spacy
+    def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None):
+        self.dictionary = Dictionary([[i.group(0)
+                                      for i in self.word_regex
+                                        .finditer(doc.text)]
+                                      for doc in driver.stream()])
+        stop_ids = [self.dictionary.token2id[stopword] for stopword
+                    in self.stop_words if stopword in self.dictionary.token2id]
+        once_ids = [tokenid for tokenid, docfreq in
+                    iteritems(self.dictionary.dfs) if docfreq == 1]
+        self.dictionary.filter_tokens(stop_ids + once_ids)
+        self.corpus = self.create_corpus(driver)
+        self.model = gensim.models.ldamodel.LdaModel(self.corpus,
+                                                     id2word=self.dictionary,
+                                                     alpha=alpha
+                                                     eta=eta,
+                                                     num_topics=num_topics,
+                                                     passes=passes)
+        return self

From bb5f0369f91305dce8b0b588c848d950d6d7cfca Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 19 Dec 2017 14:14:31 -0500
Subject: [PATCH 04/16] adding topic modeling dependencies

---
 setup.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/setup.py b/setup.py
index 3d424b1..910fde8 100644
--- a/setup.py
+++ b/setup.py
@@ -65,6 +65,10 @@ def find_version(*file_paths):
         'nlp': [
             'textblob',
             'nltk',
+        ],
+        'topic_modeling': [
+            'gensim',
+            'spacy'
         ]
     },
     entry_points={

From 2a15b324b20475de69c80e82064e2826c8096fc0 Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 19 Dec 2017 14:19:49 -0500
Subject: [PATCH 05/16] fixing syntax

---
 quantgov/estimator/structures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py
index 9dd89db..199d9d0 100644
--- a/quantgov/estimator/structures.py
+++ b/quantgov/estimator/structures.py
@@ -142,7 +142,7 @@ def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None):
         self.corpus = self.create_corpus(driver)
         self.model = gensim.models.ldamodel.LdaModel(self.corpus,
                                                      id2word=self.dictionary,
-                                                     alpha=alpha
+                                                     alpha=alpha,
                                                      eta=eta,
                                                      num_topics=num_topics,
                                                      passes=passes)

From c3faebfff500fc50d8ecf1b1c0bf569076d03c91 Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 19 Dec 2017 16:32:44 -0500
Subject: [PATCH 06/16] lowercasing words

---
 quantgov/estimator/structures.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py
index 199d9d0..816b6e5 100644
--- a/quantgov/estimator/structures.py
+++ b/quantgov/estimator/structures.py
@@ -123,14 +123,14 @@ def transform(self, driver):
         return self.model.transform(driver.stream)
 
     def create_corpus(self, driver):
-        return [self.dictionary.doc2bow([i.group(0)
+        return [self.dictionary.doc2bow([i.group(0).lower()
                 for i in self.word_regex.finditer(doc.text)])
                 for doc in driver.stream()]
 
     @check_gensim
     @check_spacy
     def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None):
-        self.dictionary = Dictionary([[i.group(0)
+        self.dictionary = Dictionary([[i.group(0).lower()
                                       for i in self.word_regex
                                         .finditer(doc.text)]
                                       for doc in driver.stream()])

From c6b764d9809ba4d4102ddafa50889e1188c7ba8a Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 2 Jan 2018 12:35:35 -0500
Subject: [PATCH 07/16] adding sklearn api version

---
 quantgov/estimator/structures.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py
index 816b6e5..9aad0f3 100644
--- a/quantgov/estimator/structures.py
+++ b/quantgov/estimator/structures.py
@@ -13,6 +13,7 @@
 try:
     from spacy.lang.en.stop_words import STOP_WORDS
     from gensim.corpora import Dictionary
+    from gensim import sklearn_api
     import gensim
     spacy = True
 except ImportError:
@@ -115,20 +116,21 @@ class CandidateModel(
 
 
 class QGLdaModel(BaseEstimator, TransformerMixin):
+    @check_gensim
+    @check_spacy
     def __init__(self, word_regex=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS):
         self.stop_words = stop_words
         self.word_regex = re.compile(word_regex)
 
     def transform(self, driver):
-        return self.model.transform(driver.stream)
+        self.test_corpus = self.create_corpus(driver)
+        return self.model.transform(self.test_corpus)
 
     def create_corpus(self, driver):
         return [self.dictionary.doc2bow([i.group(0).lower()
                 for i in self.word_regex.finditer(doc.text)])
                 for doc in driver.stream()]
 
-    @check_gensim
-    @check_spacy
     def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None):
         self.dictionary = Dictionary([[i.group(0).lower()
                                       for i in self.word_regex
@@ -140,10 +142,12 @@ def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None):
                     iteritems(self.dictionary.dfs) if docfreq == 1]
         self.dictionary.filter_tokens(stop_ids + once_ids)
         self.corpus = self.create_corpus(driver)
-        self.model = gensim.models.ldamodel.LdaModel(self.corpus,
-                                                     id2word=self.dictionary,
-                                                     alpha=alpha,
-                                                     eta=eta,
-                                                     num_topics=num_topics,
-                                                     passes=passes)
+        self.model = sklearn_api.ldamodel.LdaTransformer(
+            alpha=alpha,
+            eta=eta,
+            num_topics=num_topics,
+            passes=passes,
+            id2word=self.dictionary
+        )
+        self.model.fit(self.corpus)
         return self

From a17ed349c7f8bdd67caa88b1301caf46ed7c83cb Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 2 Jan 2018 12:36:25 -0500
Subject: [PATCH 08/16] initial commit testing topic model

---
 tests/test_estimators.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 tests/test_estimators.py

diff --git a/tests/test_estimators.py b/tests/test_estimators.py
new file mode 100644
index 0000000..5b099aa
--- /dev/null
+++ b/tests/test_estimators.py
@@ -0,0 +1,27 @@
+# import pytest
+import subprocess
+import quantgov.estimator
+import quantgov
+
+from pathlib import Path
+
+PSEUDO_CORPUS_PATH = Path(__file__).resolve().parent.joinpath('pseudo_corpus')
+driver = quantgov.load_driver(PSEUDO_CORPUS_PATH)
+# models = quantgov.estimator.utils.load_models('./sample_models.py')
+
+
+# def test_all_model_evaluation():
+#     quantgov.estimator.evaluation.evaluate_all_models(models, X, y, 2, 'f1')
+
+
+def test_topic_model():
+    sample = quantgov.estimator.structures.QGLdaModel()
+    sample.fit(driver)
+    sample.transform(driver)
+
+
+def check_output(cmd):
+    return (
+        subprocess.check_output(cmd, universal_newlines=True)
+        .replace('\n\n', '\n')
+    )

From d5592f0d829801bf302c3652ad8c38c6354316c3 Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 2 Jan 2018 12:37:14 -0500
Subject: [PATCH 09/16] loading all models for testing

---
 tests/sample_models.py | 98 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 tests/sample_models.py

diff --git a/tests/sample_models.py b/tests/sample_models.py
new file mode 100644
index 0000000..38be9a5
--- /dev/null
+++ b/tests/sample_models.py
@@ -0,0 +1,98 @@
+"""
+quantgov.estimator.candidate_sets: Starter model candidate sets
+
+
+This module provides a few sample sets of models for common problems. These are
+mostly helpful for initial analysis; in general, you will want to customize
+these.
+
+The currently included candidates sets are:
+    * `classificaiton`: Random Forests and Logit with TF-IDF preprocessor
+    * `multilabel_classificaiton`: same as classification, with the Logit
+        classifier wrapped in a MultiOutputClassifier
+"""
+import numpy as np
+import sklearn.ensemble
+import sklearn.linear_model
+import sklearn.multioutput
+import sklearn.pipeline
+import sklearn.feature_extraction
+from decorator import decorator
+# from . import structures
+
+try:
+    import gensim
+except ImportError:
+    gensim = None
+
+import quantgov.estimator
+
+
+# @decorator
+# def check_gensim(func, *args, **kwargs):
+#     if gensim is None:
+#         raise RuntimeError('Must install gensim to use {}'.format(func))
+#     return func(*args, **kwargs)
+
+
+models = [
+    quantgov.estimator.CandidateModel(
+        name="Random Forests",
+        model=sklearn.pipeline.Pipeline(steps=(
+            ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()),
+            ('rf', sklearn.ensemble.RandomForestClassifier(n_jobs=-1)),
+        )),
+        parameters={
+            'rf__n_estimators': [5, 10, 25, 50, 100],
+        }
+    ),
+    quantgov.estimator.CandidateModel(
+        name="Logistic Regression",
+        model=sklearn.pipeline.Pipeline(steps=(
+            ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()),
+            ('logit', sklearn.linear_model.LogisticRegression()),
+        )),
+        parameters={
+            'logit__C': np.logspace(-2, 2, 5)
+        }
+    ),
+]
+
+
+multilabel_classification = [
+    quantgov.estimator.CandidateModel(
+        name="Random Forests",
+        model=sklearn.pipeline.Pipeline(steps=(
+            ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()),
+            ('rf', sklearn.ensemble.RandomForestClassifier(n_jobs=-1)),
+        )),
+        parameters={
+            'rf__n_estimators': [5, 10, 25, 50, 100],
+        }
+    ),
+    quantgov.estimator.CandidateModel(
+        name="Logistic Regression",
+        model=sklearn.pipeline.Pipeline(steps=(
+            ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()),
+            ('logit', sklearn.multioutput.MultiOutputClassifier(
+                sklearn.linear_model.LogisticRegression(),
+                n_jobs=-1
+            )),
+        )),
+        parameters={
+            'logit__estimator__C': np.logspace(-2, 2, 5)
+        }
+    ),
+]
+
+topic_modeling = [
+    quantgov.estimator.CandidateModel(
+        name="LDA",
+        model=quantgov.estimator.structures.QGLdaModel(),
+        parameters={
+            'eta': [0.1, 0.05, 0.01],
+            'passes': [1, 2, 3],
+            'num_topics': [10, 50, 100]
+        }
+    ),
+]

From 58513f7cd00e074d03e3b9f8acb494e70d188b8f Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 2 Jan 2018 13:03:18 -0500
Subject: [PATCH 10/16] passing topic model test

---
 quantgov/estimator/structures.py |  2 +-
 tests/sample_models.py           | 98 --------------------------------
 tests/test_estimators.py         |  7 +--
 3 files changed, 2 insertions(+), 105 deletions(-)
 delete mode 100644 tests/sample_models.py

diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py
index 9aad0f3..47fa7e9 100644
--- a/quantgov/estimator/structures.py
+++ b/quantgov/estimator/structures.py
@@ -131,7 +131,7 @@ def create_corpus(self, driver):
                 for i in self.word_regex.finditer(doc.text)])
                 for doc in driver.stream()]
 
-    def fit(self, driver, alpha=None, eta=None, num_topics=None, passes=None):
+    def fit(self, driver, alpha=None, eta=None, num_topics=1, passes=1):
         self.dictionary = Dictionary([[i.group(0).lower()
                                       for i in self.word_regex
                                         .finditer(doc.text)]
diff --git a/tests/sample_models.py b/tests/sample_models.py
deleted file mode 100644
index 38be9a5..0000000
--- a/tests/sample_models.py
+++ /dev/null
@@ -1,98 +0,0 @@
-"""
-quantgov.estimator.candidate_sets: Starter model candidate sets
-
-
-This module provides a few sample sets of models for common problems. These are
-mostly helpful for initial analysis; in general, you will want to customize
-these.
-
-The currently included candidates sets are:
-    * `classificaiton`: Random Forests and Logit with TF-IDF preprocessor
-    * `multilabel_classificaiton`: same as classification, with the Logit
-        classifier wrapped in a MultiOutputClassifier
-"""
-import numpy as np
-import sklearn.ensemble
-import sklearn.linear_model
-import sklearn.multioutput
-import sklearn.pipeline
-import sklearn.feature_extraction
-from decorator import decorator
-# from . import structures
-
-try:
-    import gensim
-except ImportError:
-    gensim = None
-
-import quantgov.estimator
-
-
-# @decorator
-# def check_gensim(func, *args, **kwargs):
-#     if gensim is None:
-#         raise RuntimeError('Must install gensim to use {}'.format(func))
-#     return func(*args, **kwargs)
-
-
-models = [
-    quantgov.estimator.CandidateModel(
-        name="Random Forests",
-        model=sklearn.pipeline.Pipeline(steps=(
-            ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()),
-            ('rf', sklearn.ensemble.RandomForestClassifier(n_jobs=-1)),
-        )),
-        parameters={
-            'rf__n_estimators': [5, 10, 25, 50, 100],
-        }
-    ),
-    quantgov.estimator.CandidateModel(
-        name="Logistic Regression",
-        model=sklearn.pipeline.Pipeline(steps=(
-            ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()),
-            ('logit', sklearn.linear_model.LogisticRegression()),
-        )),
-        parameters={
-            'logit__C': np.logspace(-2, 2, 5)
-        }
-    ),
-]
-
-
-multilabel_classification = [
-    quantgov.estimator.CandidateModel(
-        name="Random Forests",
-        model=sklearn.pipeline.Pipeline(steps=(
-            ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()),
-            ('rf', sklearn.ensemble.RandomForestClassifier(n_jobs=-1)),
-        )),
-        parameters={
-            'rf__n_estimators': [5, 10, 25, 50, 100],
-        }
-    ),
-    quantgov.estimator.CandidateModel(
-        name="Logistic Regression",
-        model=sklearn.pipeline.Pipeline(steps=(
-            ('tfidf', sklearn.feature_extraction.text.TfidfTransformer()),
-            ('logit', sklearn.multioutput.MultiOutputClassifier(
-                sklearn.linear_model.LogisticRegression(),
-                n_jobs=-1
-            )),
-        )),
-        parameters={
-            'logit__estimator__C': np.logspace(-2, 2, 5)
-        }
-    ),
-]
-
-topic_modeling = [
-    quantgov.estimator.CandidateModel(
-        name="LDA",
-        model=quantgov.estimator.structures.QGLdaModel(),
-        parameters={
-            'eta': [0.1, 0.05, 0.01],
-            'passes': [1, 2, 3],
-            'num_topics': [10, 50, 100]
-        }
-    ),
-]
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
index 5b099aa..94af230 100644
--- a/tests/test_estimators.py
+++ b/tests/test_estimators.py
@@ -7,16 +7,11 @@
 
 PSEUDO_CORPUS_PATH = Path(__file__).resolve().parent.joinpath('pseudo_corpus')
 driver = quantgov.load_driver(PSEUDO_CORPUS_PATH)
-# models = quantgov.estimator.utils.load_models('./sample_models.py')
-
-
-# def test_all_model_evaluation():
-#     quantgov.estimator.evaluation.evaluate_all_models(models, X, y, 2, 'f1')
 
 
 def test_topic_model():
     sample = quantgov.estimator.structures.QGLdaModel()
-    sample.fit(driver)
+    sample.fit(driver, num_topics=2)
     sample.transform(driver)
 
 

From aac7d98b0f349c1e47a756719e0c74d20aa3064e Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 2 Jan 2018 13:08:46 -0500
Subject: [PATCH 11/16] updating travis config dependencies

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 75cc0ab..1833fc7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,6 +5,7 @@ python:
 install:
 - pip install ".[testing]"
 - pip install ".[nlp]"
+- pip install ".[topic_modeling]"
 - python -m nltk.downloader punkt stopwords wordnet
 script: pytest
 deploy:

From ef9853f0f6d1638d6c50c73aea5bc9d3d3864e1b Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Tue, 2 Jan 2018 13:18:42 -0500
Subject: [PATCH 12/16] checking import before initializing

---
 quantgov/estimator/candidate_sets.py | 37 ++++++++++++++--------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/quantgov/estimator/candidate_sets.py b/quantgov/estimator/candidate_sets.py
index e59b4a2..7e88942 100644
--- a/quantgov/estimator/candidate_sets.py
+++ b/quantgov/estimator/candidate_sets.py
@@ -17,24 +17,22 @@
 import sklearn.multioutput
 import sklearn.pipeline
 import sklearn.feature_extraction
-from decorator import decorator
 from . import structures
 
 try:
     import gensim
 except ImportError:
     gensim = None
+try:
+    import gensim
+    import spacy
+except ImportError:
+    spacy = None
+    gensim = None
 
 import quantgov.estimator
 
 
-@decorator
-def check_gensim(func, *args, **kwargs):
-    if gensim is None:
-        raise RuntimeError('Must install gensim to use {}'.format(func))
-    return func(*args, **kwargs)
-
-
 classification = [
     quantgov.estimator.CandidateModel(
         name="Random Forests",
@@ -85,14 +83,15 @@ def check_gensim(func, *args, **kwargs):
     ),
 ]
 
-topic_modeling = [
-    quantgov.estimator.CandidateModel(
-        name="LDA",
-        model=structures.QGLdaModel(),
-        parameters={
-            'eta': [0.1, 0.05, 0.01],
-            'passes': [1, 2, 3],
-            'num_topics': [10, 50, 100]
-        }
-    ),
-]
+if gensim and spacy:
+    topic_modeling = [
+        quantgov.estimator.CandidateModel(
+            name="LDA",
+            model=structures.QGLdaModel(),
+            parameters={
+                'eta': [0.1, 0.05, 0.01],
+                'passes': [1, 2, 3],
+                'num_topics': [10, 50, 100]
+            }
+        ),
+    ]

From c97c6dfc63fe88ecb8857b211b8783603ae43648 Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Thu, 18 Jan 2018 10:25:35 -0500
Subject: [PATCH 13/16] adjusting dependencies, variable names

---
 quantgov/estimator/candidate_sets.py |  2 +-
 quantgov/estimator/structures.py     | 21 +++++++--------------
 tests/test_estimators.py             |  2 +-
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/quantgov/estimator/candidate_sets.py b/quantgov/estimator/candidate_sets.py
index 7e88942..38e6813 100644
--- a/quantgov/estimator/candidate_sets.py
+++ b/quantgov/estimator/candidate_sets.py
@@ -87,7 +87,7 @@
     topic_modeling = [
         quantgov.estimator.CandidateModel(
             name="LDA",
-            model=structures.QGLdaModel(),
+            model=structures.GensimLda(),
             parameters={
                 'eta': [0.1, 0.05, 0.01],
                 'passes': [1, 2, 3],
diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py
index 47fa7e9..000e48f 100644
--- a/quantgov/estimator/structures.py
+++ b/quantgov/estimator/structures.py
@@ -11,21 +11,15 @@
 import re
 
 try:
-    from spacy.lang.en.stop_words import STOP_WORDS
     from gensim.corpora import Dictionary
     from gensim import sklearn_api
     import gensim
-    spacy = True
 except ImportError:
-    spacy = None
     gensim = None
 
 
-@decorator
-def check_spacy(func, *args, **kwargs):
-    if spacy is None:
-        raise RuntimeError('Must install spacy to use {}'.format(func))
-    return func(*args, **kwargs)
+from sklearn.feature_extraction import stop_words
+STOP_WORDS = stop_words.ENGLISH_STOP_WORDS
 
 
 @decorator
@@ -115,12 +109,11 @@ class CandidateModel(
     pass
 
 
-class QGLdaModel(BaseEstimator, TransformerMixin):
+class GensimLda(BaseEstimator, TransformerMixin):
     @check_gensim
-    @check_spacy
-    def __init__(self, word_regex=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS):
+    def __init__(self, word_pattern=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS):
         self.stop_words = stop_words
-        self.word_regex = re.compile(word_regex)
+        self.word_pattern = re.compile(word_pattern)
 
     def transform(self, driver):
         self.test_corpus = self.create_corpus(driver)
@@ -128,12 +121,12 @@ def transform(self, driver):
 
     def create_corpus(self, driver):
         return [self.dictionary.doc2bow([i.group(0).lower()
-                for i in self.word_regex.finditer(doc.text)])
+                for i in self.word_pattern.finditer(doc.text)])
                 for doc in driver.stream()]
 
     def fit(self, driver, alpha=None, eta=None, num_topics=1, passes=1):
         self.dictionary = Dictionary([[i.group(0).lower()
-                                      for i in self.word_regex
+                                      for i in self.word_pattern
                                         .finditer(doc.text)]
                                       for doc in driver.stream()])
         stop_ids = [self.dictionary.token2id[stopword] for stopword
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
index 94af230..2b52c5d 100644
--- a/tests/test_estimators.py
+++ b/tests/test_estimators.py
@@ -10,7 +10,7 @@
 
 
 def test_topic_model():
-    sample = quantgov.estimator.structures.QGLdaModel()
+    sample = quantgov.estimator.structures.GensimLda()
     sample.fit(driver, num_topics=2)
     sample.transform(driver)
 

From aaffd9419caa0d364573bd1d38761984eeccd8fe Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Thu, 18 Jan 2018 10:31:16 -0500
Subject: [PATCH 14/16] moving stopword removal

---
 quantgov/estimator/structures.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py
index 000e48f..992a3ed 100644
--- a/quantgov/estimator/structures.py
+++ b/quantgov/estimator/structures.py
@@ -127,13 +127,12 @@ def create_corpus(self, driver):
     def fit(self, driver, alpha=None, eta=None, num_topics=1, passes=1):
         self.dictionary = Dictionary([[i.group(0).lower()
                                       for i in self.word_pattern
-                                        .finditer(doc.text)]
+                                        .finditer(doc.text)
+                                       if i not in self.stop_words]
                                       for doc in driver.stream()])
-        stop_ids = [self.dictionary.token2id[stopword] for stopword
-                    in self.stop_words if stopword in self.dictionary.token2id]
         once_ids = [tokenid for tokenid, docfreq in
                     iteritems(self.dictionary.dfs) if docfreq == 1]
-        self.dictionary.filter_tokens(stop_ids + once_ids)
+        self.dictionary.filter_tokens(once_ids)
         self.corpus = self.create_corpus(driver)
         self.model = sklearn_api.ldamodel.LdaTransformer(
             alpha=alpha,

From 58db43ab23c01b0840a379433b4d5ba7724fd7df Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Thu, 18 Jan 2018 10:46:11 -0500
Subject: [PATCH 15/16] restructuring stopword args, adjusting min word freq

---
 quantgov/estimator/structures.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py
index 992a3ed..1f32bc6 100644
--- a/quantgov/estimator/structures.py
+++ b/quantgov/estimator/structures.py
@@ -111,8 +111,14 @@ class CandidateModel(
 
 class GensimLda(BaseEstimator, TransformerMixin):
     @check_gensim
-    def __init__(self, word_pattern=r'\b[A-z]{2,}\b', stop_words=STOP_WORDS):
-        self.stop_words = stop_words
+    def __init__(self, word_pattern=r'\b[A-z]{2,}\b', stop_words='en'):
+        if stop_words == 'en':
+            self.stop_words = STOP_WORDS
+        elif not stop_words:
+            self.stop_words = None
+        else:
+            self.stop_words = stop_words
+
         self.word_pattern = re.compile(word_pattern)
 
     def transform(self, driver):
@@ -124,15 +130,17 @@ def create_corpus(self, driver):
                 for i in self.word_pattern.finditer(doc.text)])
                 for doc in driver.stream()]
 
-    def fit(self, driver, alpha=None, eta=None, num_topics=1, passes=1):
+    def fit(self, driver, alpha=None, eta=None, num_topics=1,
+            passes=1, min_wf=1):
         self.dictionary = Dictionary([[i.group(0).lower()
                                       for i in self.word_pattern
-                                        .finditer(doc.text)
-                                       if i not in self.stop_words]
+                                        .finditer(doc.text)]
                                       for doc in driver.stream()])
+        stop_ids = [self.dictionary.token2id[stopword] for stopword
+                    in self.stop_words if stopword in self.dictionary.token2id]
         once_ids = [tokenid for tokenid, docfreq in
-                    iteritems(self.dictionary.dfs) if docfreq == 1]
-        self.dictionary.filter_tokens(once_ids)
+                    iteritems(self.dictionary.dfs) if docfreq <= min_wf]
+        self.dictionary.filter_tokens(stop_ids + once_ids)
         self.corpus = self.create_corpus(driver)
         self.model = sklearn_api.ldamodel.LdaTransformer(
             alpha=alpha,

From 30af4c2acb9d22752a382e63e857489d30f0b24d Mon Sep 17 00:00:00 2001
From: Michael Gasvoda <mgasvoda@mercatus.gmu.edu>
Date: Thu, 1 Mar 2018 14:45:30 -0500
Subject: [PATCH 16/16] providing wrapper for show_topics

---
 quantgov/estimator/structures.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/quantgov/estimator/structures.py b/quantgov/estimator/structures.py
index 1f32bc6..d88d570 100644
--- a/quantgov/estimator/structures.py
+++ b/quantgov/estimator/structures.py
@@ -130,6 +130,9 @@ def create_corpus(self, driver):
                 for i in self.word_pattern.finditer(doc.text)])
                 for doc in driver.stream()]
 
+    def show_topics(self):
+        return self.model.gensim_model.show_topics()
+
     def fit(self, driver, alpha=None, eta=None, num_topics=1,
             passes=1, min_wf=1):
         self.dictionary = Dictionary([[i.group(0).lower()