Skip to content

Commit

Permalink
Merge pull request #58 from QuantGov/release-0.5.0
Browse files: browse the repository at this point in the history
Release 0.5.0
  • Loading branch information
OliverSherouse committed Sep 28, 2018
2 parents 471d183 + 378f91b commit f0dbec0
Show file tree
Hide file tree
Showing 27 changed files with 848 additions and 415 deletions.
11 changes: 11 additions & 0 deletions Pipfile
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"

[packages]
"e1839a8" = {path = ".", extras = ["nlp", "s3driver"], editable = true}

[dev-packages]
"pytest-flake8" = "*"
ipython = "*"
455 changes: 455 additions & 0 deletions Pipfile.lock

Large diffs are not rendered by default.

13 changes: 2 additions & 11 deletions quantgov/__init__.py
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,7 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)

__all__ = [
'corpora',
'corpus',
'estimator',
'project',
'utils',
]

from . import corpora # Backwards compatibility

from . import corpus, nlp, ml, utils
from .utils import load_driver

__version__ = '0.4.2'
__version__ = '0.5.0'
106 changes: 69 additions & 37 deletions quantgov/__main__.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,11 +11,10 @@
import sys
import zipfile

import joblib as jl
import requests

import joblib as jl
import quantgov
import quantgov.corpus.builtins

from pathlib import Path

Expand All @@ -37,11 +36,11 @@ def parse_args():
create.add_argument('path', type=Path)
create.add_argument('--parent', default='master')

# Corpus command
corpus = subparsers.add_parser('corpus')
corpus_subcommands = corpus.add_subparsers(dest='subcommand')
for command, builtin in quantgov.corpus.builtins.commands.items():
subcommand = corpus_subcommands.add_parser(
# NLP command
nlp_subparser = subparsers.add_parser('nlp')
nlp_subcommands = nlp_subparser.add_subparsers(dest='subcommand')
for command, builtin in quantgov.nlp.commands.items():
subcommand = nlp_subcommands.add_parser(
command, help=builtin.cli.help)
subcommand.add_argument(
'corpus', help='Path to a QuantGov Corpus directory')
Expand All @@ -56,21 +55,24 @@ def parse_args():
default=sys.stdout
)

# Estimator Command
estimator = subparsers.add_parser('estimator')
estimator_subcommands = estimator.add_subparsers(dest='subcommand')
# ML Command
ml_parser = subparsers.add_parser('ml')
ml_subcommands = ml_parser.add_subparsers(dest='subcommand')

# Estimator Evaluate
evaluate = estimator_subcommands.add_parser(
# ML Evaluate
evaluate = ml_subcommands.add_parser(
'evaluate', help='Evaluate candidate models')
evaluate.add_argument(
'modeldefs', type=Path,
help='python module containing candidate models'
)
evaluate.add_argument(
'trainers', type=jl.load, help='saved Trainers object')
'trainers',
type=quantgov.ml.Trainers.load,
help='saved Trainers object'
)
evaluate.add_argument(
'labels', type=jl.load, help='saved Labels object')
'labels', type=quantgov.ml.Labels.load, help='saved Labels object')
evaluate.add_argument(
'output_results',
type=lambda x: open(x, 'w', encoding=ENCODE_OUT),
Expand All @@ -86,31 +88,36 @@ def parse_args():
help='Number of folds for cross-validation')
evaluate.add_argument('--scoring', default='f1', help='scoring method')

# Estimator Train
train = estimator_subcommands.add_parser('train', help='Train a model')
# ML Train
train = ml_subcommands.add_parser('train', help='Train a model')
train.add_argument(
'modeldefs', type=Path,
help='Python module containing candidate models'
)
train.add_argument('configfile', help='Model configuration file')
train.add_argument(
'trainers', type=jl.load, help='saved Trainers object')
'vectorizer',
type=jl.load,
help='saved Vectorizer object'
)
train.add_argument(
'trainers',
type=quantgov.ml.Trainers.load,
help='saved Trainers object'
)
train.add_argument(
'labels', type=jl.load, help='saved Labels object')
'labels', type=quantgov.ml.Labels.load, help='saved Labels object')
train.add_argument(
'-o', '--outfile', help='location to save the trained model'
'-o', '--outfile', help='location to save the trained Estimator'
)

# Estimator Estimate
estimate = estimator_subcommands.add_parser(
# ML Estimate
estimate = ml_subcommands.add_parser(
'estimate', help='Estimate label values for a target corpus')
estimate.add_argument(
'vectorizer', type=jl.load,
help='joblib-saved scikit-learn vectorizer'
)
estimate.add_argument(
'model', type=jl.load,
help='saved Model object'
'estimator',
type=quantgov.ml.Estimator.load,
help='saved Estimator object'
)
estimate.add_argument(
'corpus', type=quantgov.load_driver,
Expand Down Expand Up @@ -164,7 +171,7 @@ def start_component(args):
def run_corpus_builtin(args):
driver = quantgov.load_driver(args.corpus)
writer = csv.writer(args.outfile)
builtin = quantgov.corpus.builtins.commands[args.subcommand]
builtin = quantgov.nlp.commands[args.subcommand]
func_args = {i: j for i, j in vars(args).items()
if i not in {'command', 'subcommand', 'outfile', 'corpus'}}
writer.writerow(driver.index_labels + builtin.get_columns(func_args))
Expand All @@ -179,27 +186,52 @@ def run_corpus_builtin(args):

def run_estimator(args):
if args.subcommand == "evaluate":
quantgov.estimator.evaluate(
quantgov.ml.evaluate(
args.modeldefs, args.trainers, args.labels, args.folds,
args.scoring, args.output_results, args.output_suggestion
)
elif args.subcommand == "train":
quantgov.estimator.train_and_save_model(
args.modeldefs, args.configfile, args.trainers, args.labels,
args.outfile)
quantgov.ml.train_and_save_model(
args.modeldefs, args.configfile, args.vectorizer, args.trainers,
args.labels, args.outfile)
elif args.subcommand == "estimate":
quantgov.estimator.estimate(
args.vectorizer, args.model, args.corpus, args.probability,
args.precision, args.outfile
writer = csv.writer(args.outfile)
labels = args.corpus.index_labels
if args.probability:
if args.estimator.multilabel:
if args.estimator.multiclass:
writer.writerow(labels + ('label', 'class', 'probability'))
else:
writer.writerow(labels + ('label', 'probability'))
elif args.estimator.multiclass:
writer.writerow(labels + ('class', 'probability'))
else:
writer.writerow(
labels + ('{}_prob'.format(args.estimator.label_names[0]),)
)
else:
if args.estimator.multilabel:
writer.writerow(labels + ('label', 'prediction'))
else:
writer.writerow(
labels + ('{}'.format(args.estimator.label_names[0]),)
)
writer.writerows(
docidx + result for docidx,
result in quantgov.ml.estimate(
args.estimator,
args.corpus,
args.probability,
args.precision)
)


def main():
args = parse_args()
{
'start': start_component,
'corpus': run_corpus_builtin,
'estimator': run_estimator
'nlp': run_corpus_builtin,
'ml': run_estimator,
}[args.command](args)


Expand Down
18 changes: 0 additions & 18 deletions quantgov/corpora/__init__.py

This file was deleted.

13 changes: 3 additions & 10 deletions quantgov/corpus/structures.py → quantgov/corpus.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
"""
quantgov.corpora.structures
quantgov.corpus
Classes for Writing QuantGov Corpora
"""
Expand All @@ -13,7 +13,7 @@
from collections import namedtuple
from pathlib import Path

from .. import utils as qgutils
from . import utils as qgutils

try:
import boto3
Expand Down Expand Up @@ -286,17 +286,10 @@ def __init__(self, index, bucket, encoding='utf-8', cache=True):
super(IndexDriver, self).__init__(
index_labels=index_labels, encoding=encoding, cache=cache)

def gen_indices_and_paths(self):
with self.index.open(encoding=self.encoding) as inf:
reader = csv.reader(inf)
next(reader)
for row in reader:
yield tuple(row[:-1]), row[-1]

def read(self, docinfo):
idx, path = docinfo
body = self.client.get_object(Bucket=self.bucket,
Key=str(path))['Body']
Key=str(path).replace('\\', '/'))['Body']
return Document(idx, body.read().decode(self.encoding))

def filter(self, pattern):
Expand Down
11 changes: 0 additions & 11 deletions quantgov/corpus/__init__.py

This file was deleted.

Loading

0 comments on commit f0dbec0

Please sign in to comment.