Nana Mensah authored and committed on Sep 6, 2019
1 parent 106d9fa · commit e58640f
Showing 7 changed files with 426 additions and 0 deletions.
Empty file.
@@ -0,0 +1,87 @@
"""clean.py
Drop and encode features specified in config.json.
"""

import argparse
import category_encoders as ce
import pandas as pd

from src.log import Logger
log = Logger('clean')


class Cleaner:
    """Drop and encode columns in a dataframe. Additionally, drops rows with NA values.
    Args:
        data (pd.DataFrame): Dataframe containing data
        to_drop (List): Column names to drop
        to_encode (List): Column names to one-hot encode
    Methods:
        process: Applies each cleaning operation to a copy of the dataframe, in order
        encode: One-hot encodes features
        drop_columns: Remove features passed at class initialisation
        drop_na_rows: Remove any rows with empty cells
    """
    def __init__(self, data, to_drop, to_encode):
        self.data = data
        self.to_drop = to_drop
        self.to_encode = to_encode
        # Set the list of cleaning methods to apply, in order
        self.operations = [self.encode, self.drop_columns, self.drop_na_rows]

    def process(self):
        df_process = self.data.copy(deep=True)
        for operation in self.operations:
            df_process = operation(df_process)
        return df_process

    def encode(self, df):
        encodable = set(self.to_encode).intersection(set(df.columns))
        if not encodable:  # No columns to encode
            return df
        else:
            encoder = ce.OneHotEncoder(cols=list(encodable), use_cat_names=True,
                                       handle_unknown='ignore', return_df=True)
            encoded = encoder.fit_transform(df)
            return encoded

    def drop_columns(self, df):
        return df.drop(columns=self.to_drop, errors='ignore')

    def drop_na_rows(self, df):
        return df.dropna(axis='index', how='any')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input')
    parser.add_argument('--drop', nargs='+')
    parser.add_argument('--encode', nargs='+')
    args = parser.parse_args()

    # Read data
    log.info('BEGIN')
    indf = pd.read_csv(args.input, index_col=0)
    log.info(f'Input shape {indf.shape}')

    # Clean data
    cleaner = Cleaner(data=indf, to_drop=args.drop, to_encode=args.encode)
    cleaned = cleaner.process()

    # Assert only one column contains strings. This will be the target,
    # used later to split the data.
    string_cols = cleaned.select_dtypes('object').columns
    log.debug(string_cols)
    assert len(string_cols) == 1

    # Write data
    output = 'cleaned.csv'
    cleaned.to_csv(output)
    log.info(f'Output shape {cleaned.shape}')
    log.info('END')


if __name__ == "__main__":
    main()
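A minimal sketch of how Cleaner behaves on a toy frame (the column names below are hypothetical, not from the repo, and the import assumes clean.py is on the path):

import pandas as pd
from clean import Cleaner  # hypothetical import path for the file above

df = pd.DataFrame({
    'id': [1, 2, 3],
    'colour': ['red', 'blue', 'red'],
    'value': [0.1, None, 0.3],
})
cleaned = Cleaner(data=df, to_drop=['id'], to_encode=['colour']).process()
# 'colour' expands to one-hot columns (colour_red, colour_blue) because
# use_cat_names=True, 'id' is dropped, and the middle row is removed
# because its 'value' cell is NA.
print(cleaned)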
@@ -0,0 +1,110 @@
"""evaluate.py - Evaluate model against test data"""

from src.log import Logger
log = Logger('evaluate')

import argparse
import pandas as pd
import numpy as np
import joblib

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.calibration import CalibratedClassifierCV


def get_training_data(cli_args):
    """Load the training dataset.
    Args:
        cli_args: Argparse object with command line arguments
    """
    args = cli_args
    training = pd.read_csv(args.training, index_col=0, dtype=np.float64)
    X_train = training.drop(columns=[args.target]).to_numpy()
    y_train = training[args.target].to_numpy()
    return X_train, y_train


def model_with_proba(model, cli_args):
    """Return a model with the predict_proba method. A wrapper to catch models
    that do not implement this method by default.
    Args:
        model: An sklearn estimator object
        cli_args: Argparse object with command line arguments (used to reload
            the training data if calibration is needed)
    Returns:
        model: An sklearn estimator object with the predict_proba() method
    """
    # Models without probabilities to check
    known = ['LinearSVC']
    # Return models that already implement predict_proba
    if hasattr(model, 'predict_proba'):
        return model
    # Wrap models in the known list with a calibrator for probability prediction.
    # Do not refit the model, simply calibrate it on the training data.
    elif model.__class__.__name__ in known:
        # Wrap input model with calibrator
        calib_model = CalibratedClassifierCV(base_estimator=model, cv="prefit")
        # Recalibrate on training data
        X_train, y_train = get_training_data(cli_args)
        calib_model.fit(X_train, y_train)
        return calib_model
    else:
        raise ValueError('Model is not in known list and does not have predict_proba() method')
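# Note: cv="prefit" calibrates an already-fitted estimator without retraining it,
# and base_estimator matches the sklearn API this commit targets (later releases
# renamed it to estimator). The same pattern in isolation, on synthetic data
# (illustrative only):
#
#     from sklearn.datasets import make_classification
#     from sklearn.svm import LinearSVC
#     X, y = make_classification(n_samples=200, random_state=0)
#     svc = LinearSVC().fit(X, y)
#     calibrated = CalibratedClassifierCV(base_estimator=svc, cv="prefit").fit(X, y)
#     probs = calibrated.predict_proba(X)  # shape (200, 2)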
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--test')
    parser.add_argument('--target')
    parser.add_argument('--training')
    args = parser.parse_args()

    log.info('BEGIN')
    # Read test data and split
    log.info('Reading test data')
    df_test = pd.read_csv(args.test, index_col=0, dtype=np.float64)
    X_test = df_test.drop(columns=[args.target]).to_numpy()
    y_test = df_test[args.target].to_numpy()

    # Load model
    log.info('Loading model')
    loaded_model = joblib.load('model.joblib')
    # Ensure the predict_proba method is implemented
    model = model_with_proba(loaded_model, args)

    # Predict on unseen data
    log.info('Predicting')
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    target_1_prob = y_prob[:, 1]

    # Metrics to calculate. Note that 'roc' is scored on the hard
    # predictions here, not on target_1_prob.
    log.info('Calculating metrics')
    scores = [
        ('acc', accuracy_score),
        ('roc', roc_auc_score),
        ('prec', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
        ('mcc', matthews_corrcoef)
    ]

    # Calculate each metric against the test labels
    results = []
    for name, score_fn in scores:
        results.append((name, score_fn(y_test, y_pred)))
    df_results = pd.DataFrame(results, columns=['score', 'result'])

    # Get graph data for ROC and prec-rec curves
    log.info('Calculating ROC and prec-rec curve data')
    roc_data = roc_curve(y_test, target_1_prob)
    prec_data = precision_recall_curve(y_test, target_1_prob)
    df_roc = pd.DataFrame(roc_data, index=['fpr', 'tpr', 'thresholds']).T
    df_prec = pd.DataFrame(prec_data, index=['prec', 'rec', 'thresholds']).T

    # Write metrics to files
    df_results.to_csv('metrics.csv')
    df_roc.to_csv('roc_data.csv')
    df_prec.to_csv('precrec_data.csv')
    log.info('END')


if __name__ == '__main__':
    main()
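One quirk worth knowing when reading precrec_data.csv: precision_recall_curve returns precision and recall arrays one element longer than thresholds, so the transposed DataFrame pads the final thresholds cell with NaN. A toy illustration (labels and scores made up):

import pandas as pd
from sklearn.metrics import precision_recall_curve

prec, rec, thr = precision_recall_curve([0, 1, 1], [0.2, 0.6, 0.9])
df = pd.DataFrame((prec, rec, thr), index=['prec', 'rec', 'thresholds']).T
print(df)  # the last row's 'thresholds' entry is NaN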
@@ -0,0 +1,7 @@
import sys
# Logger is re-exported here: the other modules do `from src.log import Logger`
from logbook import Logger, NestedSetup, StreamHandler, FileHandler

format_string = '[{record.time:%y%m%d %H:%M}] {record.level_name}: snakepot {record.channel}: {record.message}'

# Send DEBUG and above to logfile.log, and mirror records to stderr
NestedSetup([FileHandler('logfile.log', format_string=format_string, level='DEBUG'),
             StreamHandler(sys.stderr, format_string=format_string, bubble=True)]).push_application()
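Importing this module once per process configures logbook globally; each script then opens its own named channel. A sketch of the resulting usage (channel name and messages are illustrative):

from src.log import Logger

log = Logger('example')  # the channel name appears in every record
log.info('BEGIN')        # -> [190906 12:00] INFO: snakepot example: BEGIN
log.debug('written to logfile.log and mirrored to stderr')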
@@ -0,0 +1,39 @@
"""predict.py - Predict scores for unlabelled data"""

import argparse
import pandas as pd
import numpy as np
import joblib
from evaluate import model_with_proba


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--unlabelled')
    parser.add_argument('--training')
    parser.add_argument('--target')
    args = parser.parse_args()

    # Load data
    unlabelled = pd.read_csv(args.unlabelled, index_col=0, dtype=np.float64)

    # Load model
    loaded_model = joblib.load('model.joblib')
    # Ensure the predict_proba method is implemented
    model = model_with_proba(loaded_model, args)

    # Predict
    y_pred = model.predict(unlabelled)
    y_prob = model.predict_proba(unlabelled)[:, 1]  # Probabilities for the '1' class

    # Build a dataframe of scores with indexes from 'unlabelled'
    pred_tuple = zip(y_pred, y_prob)
    pred_columns = ['prediction', 'probability']
    pred_index = unlabelled.index
    pred_dataframe = pd.DataFrame(pred_tuple, columns=pred_columns, index=pred_index)

    # Write scores out
    pred_dataframe.to_csv('unlabelled_predictions.csv')


if __name__ == '__main__':
    main()
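The zip-to-DataFrame step pairs each hard prediction with its class-1 probability while preserving the input row index. The same step in isolation (toy values, hypothetical index labels):

import pandas as pd

y_pred = [1, 0, 1]
y_prob = [0.91, 0.12, 0.67]
index = pd.Index(['sample_a', 'sample_b', 'sample_c'])
df = pd.DataFrame(zip(y_pred, y_prob),
                  columns=['prediction', 'probability'], index=index)
# Each row lines up a predicted label with its probability of class 1.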
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""tpot.py
Run tpot on an input training dataset."""

import importlib.util
import joblib
from src.log import Logger
log = Logger('tpot')

import argparse
import pandas as pd
import numpy as np
from tpot import TPOTClassifier


class TPOTCleaner:
    """Trim a TPOT-exported pipeline script down to its import block and the
    'exported_pipeline = ...' assignment, so it can be imported as a module."""
    def __init__(self, tpot_file):
        with open(tpot_file, 'r') as f:
            self.lines = f.readlines()

    @property
    def import_lines(self):
        # Everything up to the first blank line is the import block
        lines = self.lines
        import_break = lines.index('\n')
        import_lines = lines[:import_break]
        return import_lines

    @property
    def export_lines(self):
        # Slice from the 'exported_pipeline = ' assignment to the next blank line
        lines = self.lines
        export_start_line = list(filter(lambda x: 'exported_pipeline = ' in x, lines))[0]
        export_list = lines[lines.index(export_start_line):]
        export_break = export_list.index('\n')
        export_lines = export_list[:export_break]
        return export_lines

    def write_out(self, outfile):
        with open(outfile, 'w') as f:
            f.write("".join(self.import_lines))
            f.write("".join(self.export_lines))
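# The slicing above assumes the usual shape of a TPOT export: an import block
# terminated by the first blank line, and further down an assignment of the form
#     exported_pipeline = <estimator or pipeline>(...)
# terminated by the next blank line. Everything else in the export (the CSV
# loading and the fit/predict boilerplate) is discarded by write_out().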
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--training')
    parser.add_argument('--target')
    parser.add_argument('--outdir')
    parser.add_argument('--max_time', type=int)
    args = parser.parse_args()

    log.info('BEGIN')
    log.info('Loading data')
    training = pd.read_csv(args.training, index_col=0, dtype=np.float64)
    X_train = training.drop(columns=[args.target]).to_numpy()
    y_train = training[args.target].to_numpy()

    # TPOT setup
    pipeline_optimizer = TPOTClassifier(max_time_mins=args.max_time, cv=10, n_jobs=-1,
                                        random_state=42, verbosity=2, memory='auto')

    # TPOT run
    log.info('Running TPOT')
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export(f'{args.outdir}/tpot_pipeline.py')

    # Create a python file for refitting the model
    log.info('Cleaning TPOT output file')
    # Read the variable 'exported_pipeline' from the TPOT output
    tc = TPOTCleaner(f'{args.outdir}/tpot_pipeline.py')
    tc.write_out(f'{args.outdir}/tpot_pipe.py')

    # Refit model on training data and save
    log.info('Refitting model')
    spec = importlib.util.spec_from_file_location("src", f"{args.outdir}/tpot_pipe.py")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    model = mod.exported_pipeline
    model.fit(X_train, y_train)

    log.info('Saving model')
    joblib.dump(model, f'{args.outdir}/model.joblib')

    log.info('END')


if __name__ == "__main__":
    main()
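To see the trimming in isolation, a hedged sketch against a hand-written stand-in for a TPOT export (the file content is illustrative, not a real TPOT run; tpot.py shadows the tpot package, so the import name here is hypothetical):

from tpot_script import TPOTCleaner  # hypothetical module name for the file above

fake_export = (
    "import numpy as np\n"
    "from sklearn.linear_model import LogisticRegression\n"
    "\n"
    "tpot_data = ...  # CSV-loading boilerplate, discarded\n"
    "\n"
    "exported_pipeline = LogisticRegression(C=10.0)\n"
    "\n"
    "exported_pipeline.fit(training_features, training_target)\n"
)
with open('fake_pipeline.py', 'w') as f:
    f.write(fake_export)

TPOTCleaner('fake_pipeline.py').write_out('fake_pipe.py')
# fake_pipe.py now holds only the two imports and the
# 'exported_pipeline = LogisticRegression(C=10.0)' assignment.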