Merge pull request #58 from QuantGov/release-0.5.0

Release 0.5.0
QuantGov · Sep 28, 2018 · f0dbec0 · f0dbec0
2 parents 471d183 + 378f91b
commit f0dbec0
Show file tree

Hide file tree

Showing 27 changed files with 848 additions and 415 deletions.
diff --git a/Pipfile b/Pipfile
@@ -0,0 +1,11 @@
+[[source]]
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+"e1839a8" = {path = ".", extras = ["nlp", "s3driver"], editable = true}
+
+[dev-packages]
+"pytest-flake8" = "*"
+ipython = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/quantgov/__init__.py b/quantgov/__init__.py
@@ -1,16 +1,7 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
-__all__ = [
-    'corpora',
-    'corpus',
-    'estimator',
-    'project',
-    'utils',
-]
-
-from . import corpora  # Backwards compatibility
-
+from . import corpus, nlp, ml, utils
 from .utils import load_driver
 
-__version__ = '0.4.2'
+__version__ = '0.5.0'
diff --git a/quantgov/__main__.py b/quantgov/__main__.py
@@ -11,11 +11,10 @@
 import sys
 import zipfile
 
-import joblib as jl
 import requests
 
+import joblib as jl
 import quantgov
-import quantgov.corpus.builtins
 
 from pathlib import Path
 
@@ -37,11 +36,11 @@ def parse_args():
     create.add_argument('path', type=Path)
     create.add_argument('--parent', default='master')
 
-    # Corpus command
-    corpus = subparsers.add_parser('corpus')
-    corpus_subcommands = corpus.add_subparsers(dest='subcommand')
-    for command, builtin in quantgov.corpus.builtins.commands.items():
-        subcommand = corpus_subcommands.add_parser(
+    # NLP command
+    nlp_subparser = subparsers.add_parser('nlp')
+    nlp_subcommands = nlp_subparser.add_subparsers(dest='subcommand')
+    for command, builtin in quantgov.nlp.commands.items():
+        subcommand = nlp_subcommands.add_parser(
             command, help=builtin.cli.help)
         subcommand.add_argument(
             'corpus', help='Path to a QuantGov Corpus directory')
@@ -56,21 +55,24 @@ def parse_args():
             default=sys.stdout
         )
 
-    # Estimator Command
-    estimator = subparsers.add_parser('estimator')
-    estimator_subcommands = estimator.add_subparsers(dest='subcommand')
+    # ML Command
+    ml_parser = subparsers.add_parser('ml')
+    ml_subcommands = ml_parser.add_subparsers(dest='subcommand')
 
-    # Estimator Evaluate
-    evaluate = estimator_subcommands.add_parser(
+    # ML Evaluate
+    evaluate = ml_subcommands.add_parser(
         'evaluate', help='Evaluate candidate models')
     evaluate.add_argument(
         'modeldefs', type=Path,
         help='python module containing candidate models'
     )
     evaluate.add_argument(
-        'trainers', type=jl.load, help='saved Trainers object')
+        'trainers',
+        type=quantgov.ml.Trainers.load,
+        help='saved Trainers object'
+    )
     evaluate.add_argument(
-        'labels', type=jl.load, help='saved Labels object')
+        'labels', type=quantgov.ml.Labels.load, help='saved Labels object')
     evaluate.add_argument(
         'output_results',
         type=lambda x: open(x, 'w', encoding=ENCODE_OUT),
@@ -86,31 +88,36 @@ def parse_args():
         help='Number of folds for cross-validation')
     evaluate.add_argument('--scoring', default='f1', help='scoring method')
 
-    # Estimator Train
-    train = estimator_subcommands.add_parser('train', help='Train a model')
+    # ML Train
+    train = ml_subcommands.add_parser('train', help='Train a model')
     train.add_argument(
         'modeldefs', type=Path,
         help='Python module containing candidate models'
     )
     train.add_argument('configfile', help='Model configuration file')
     train.add_argument(
-        'trainers', type=jl.load, help='saved Trainers object')
+        'vectorizer',
+        type=jl.load,
+        help='saved Vectorizer object'
+    )
+    train.add_argument(
+        'trainers',
+        type=quantgov.ml.Trainers.load,
+        help='saved Trainers object'
+    )
     train.add_argument(
-        'labels', type=jl.load, help='saved Labels object')
+        'labels', type=quantgov.ml.Labels.load, help='saved Labels object')
     train.add_argument(
-        '-o', '--outfile', help='location to save the trained model'
+        '-o', '--outfile', help='location to save the trained Estimator'
     )
 
-    # Estimator Estimate
-    estimate = estimator_subcommands.add_parser(
+    # ML Estimate
+    estimate = ml_subcommands.add_parser(
         'estimate', help='Estimate label values for a target corpus')
     estimate.add_argument(
-        'vectorizer', type=jl.load,
-        help='joblib-saved scikit-learn vectorizer'
-    )
-    estimate.add_argument(
-        'model', type=jl.load,
-        help='saved Model object'
+        'estimator',
+        type=quantgov.ml.Estimator.load,
+        help='saved Estimator object'
     )
     estimate.add_argument(
         'corpus', type=quantgov.load_driver,
@@ -164,7 +171,7 @@ def start_component(args):
 def run_corpus_builtin(args):
     driver = quantgov.load_driver(args.corpus)
     writer = csv.writer(args.outfile)
-    builtin = quantgov.corpus.builtins.commands[args.subcommand]
+    builtin = quantgov.nlp.commands[args.subcommand]
     func_args = {i: j for i, j in vars(args).items()
                  if i not in {'command', 'subcommand', 'outfile', 'corpus'}}
     writer.writerow(driver.index_labels + builtin.get_columns(func_args))
@@ -179,27 +186,52 @@ def run_corpus_builtin(args):
 
 def run_estimator(args):
     if args.subcommand == "evaluate":
-        quantgov.estimator.evaluate(
+        quantgov.ml.evaluate(
             args.modeldefs, args.trainers, args.labels, args.folds,
             args.scoring, args.output_results, args.output_suggestion
         )
     elif args.subcommand == "train":
-        quantgov.estimator.train_and_save_model(
-            args.modeldefs, args.configfile, args.trainers, args.labels,
-            args.outfile)
+        quantgov.ml.train_and_save_model(
+            args.modeldefs, args.configfile, args.vectorizer, args.trainers,
+            args.labels, args.outfile)
     elif args.subcommand == "estimate":
-        quantgov.estimator.estimate(
-            args.vectorizer, args.model, args.corpus, args.probability,
-            args.precision, args.outfile
+        writer = csv.writer(args.outfile)
+        labels = args.corpus.index_labels
+        if args.probability:
+            if args.estimator.multilabel:
+                if args.estimator.multiclass:
+                    writer.writerow(labels + ('label', 'class', 'probability'))
+                else:
+                    writer.writerow(labels + ('label', 'probability'))
+            elif args.estimator.multiclass:
+                writer.writerow(labels + ('class', 'probability'))
+            else:
+                writer.writerow(
+                    labels + ('{}_prob'.format(args.estimator.label_names[0]),)
+                )
+        else:
+            if args.estimator.multilabel:
+                writer.writerow(labels + ('label', 'prediction'))
+            else:
+                writer.writerow(
+                    labels + ('{}'.format(args.estimator.label_names[0]),)
+                )
+        writer.writerows(
+            docidx + result for docidx,
+            result in quantgov.ml.estimate(
+                args.estimator,
+                args.corpus,
+                args.probability,
+                args.precision)
         )
 
 
 def main():
     args = parse_args()
     {
         'start': start_component,
-        'corpus': run_corpus_builtin,
-        'estimator': run_estimator
+        'nlp': run_corpus_builtin,
+        'ml': run_estimator,
     }[args.command](args)
 
 

diff --git a/quantgov/corpora/__init__.py b/quantgov/corpora/__init__.py
diff --git a/quantgov/corpus/structures.py → quantgov/corpus.py b/quantgov/corpus/structures.py → quantgov/corpus.py
@@ -1,5 +1,5 @@
 """
-quantgov.corpora.structures
+quantgov.corpus
 
 Classes for Writing QuantGov Corpora
 """
@@ -13,7 +13,7 @@
 from collections import namedtuple
 from pathlib import Path
 
-from .. import utils as qgutils
+from . import utils as qgutils
 
 try:
     import boto3
@@ -286,17 +286,10 @@ def __init__(self, index, bucket, encoding='utf-8', cache=True):
         super(IndexDriver, self).__init__(
             index_labels=index_labels, encoding=encoding, cache=cache)
 
-    def gen_indices_and_paths(self):
-        with self.index.open(encoding=self.encoding) as inf:
-            reader = csv.reader(inf)
-            next(reader)
-            for row in reader:
-                yield tuple(row[:-1]), row[-1]
-
     def read(self, docinfo):
         idx, path = docinfo
         body = self.client.get_object(Bucket=self.bucket,
-                                      Key=str(path))['Body']
+                                      Key=str(path).replace('\\', '/'))['Body']
         return Document(idx, body.read().decode(self.encoding))
 
     def filter(self, pattern):

diff --git a/quantgov/corpus/__init__.py b/quantgov/corpus/__init__.py