Commit

Add src code

Nana Mensah authored and Nana Mensah committed Sep 6, 2019
1 parent 106d9fa commit e58640f
Showing 7 changed files with 426 additions and 0 deletions.
Empty file added src/__init__.py
Empty file.
87 changes: 87 additions & 0 deletions src/clean.py
@@ -0,0 +1,87 @@
"""clean.py
Drop and encode features specified in config.json.
"""

import os
import sys

import argparse
import category_encoders as ce
import pandas as pd

from src.log import Logger
log = Logger('clean')

class Cleaner():
"""Drop and encode columns in a dataframe. Additionally, drops rows with NA values.
Args:
        data (pd.DataFrame): Dataframe containing the data to clean
to_drop (List): Column names to drop
to_encode (List): Column names to one-hot encode
Methods:
        process: Applies each cleaning method to the dataframe in sequence
encode: One-hot encodes features
drop_columns: Remove features passed at class initialisation
drop_na_rows: Remove any rows with empty cells
"""
def __init__(self, data, to_drop, to_encode):
self.data = data
self.to_drop = to_drop
self.to_encode = to_encode
        # List of cleaning methods to apply, in order
self.operations = [self.encode, self.drop_columns, self.drop_na_rows]

def process(self):
df_process = self.data.copy(deep=True)
for operation in self.operations:
df_process = operation(df_process)
return df_process

def encode(self, df):
encodable = set(self.to_encode).intersection(set(df.columns))
if not encodable: # No columns to encode
return df
else:
encoder = ce.OneHotEncoder(cols=list(encodable), use_cat_names=True, handle_unknown='ignore', return_df=True)
encoded = encoder.fit_transform(df)
return encoded

def drop_columns(self, df):
return df.drop(columns=self.to_drop, errors='ignore')

def drop_na_rows(self, df):
return df.dropna(axis='index', how='any')


def main():
parser = argparse.ArgumentParser()
parser.add_argument('--input')
parser.add_argument('--drop', nargs='+')
parser.add_argument('--encode', nargs='+')
args = parser.parse_args()

# Read data
log.info('BEGIN')
indf = pd.read_csv(args.input, index_col=0)
log.info(f'Input shape {indf.shape}')

# Clean data
cleaner = Cleaner(data=indf, to_drop=args.drop, to_encode=args.encode)
cleaned = cleaner.process()

# Assert only one column contains strings. This will be the target,
# used later to split the data.
string_cols = cleaned.select_dtypes('object').columns
log.debug(string_cols)
    assert len(string_cols) == 1, 'Expected exactly one string column (the target)'

# Write data
output = os.path.join('cleaned.csv')
cleaned.to_csv(output)
log.info(f'Output shape {cleaned.shape}')
log.info('END')

if __name__ == "__main__":
main()
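
A minimal usage sketch of the Cleaner class on a toy dataframe (the column names and values below are illustrative, not taken from the project's config.json):

import pandas as pd
from src.clean import Cleaner

# Toy data: 'id' will be dropped, 'colour' one-hot encoded, and the row with
# the missing 'age' value removed; 'label' stays as the only string column.
toy = pd.DataFrame({
    'id': [1, 2, 3],
    'colour': ['red', 'blue', 'red'],
    'age': [34.0, None, 29.0],
    'label': ['yes', 'no', 'yes'],
})

cleaner = Cleaner(data=toy, to_drop=['id'], to_encode=['colour'])
cleaned = cleaner.process()
print(cleaned.columns)  # e.g. colour_red, colour_blue, age, label
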
110 changes: 110 additions & 0 deletions src/evaluate.py
@@ -0,0 +1,110 @@
"""evaluate.py - Evaluate model against test data"""

import sys
import os
from src.log import Logger
log = Logger('evaluate')

import argparse
import pandas as pd
import numpy as np
import joblib

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.calibration import CalibratedClassifierCV

def get_training_data(cli_args):
"""Load the training dataset.
Args:
        cli_args: Argparse object with command line arguments
    Returns:
        X_train, y_train: Feature matrix and target vector as numpy arrays
    """
args = cli_args
training = pd.read_csv(args.training, index_col=0, dtype=np.float64)
X_train = training.drop(columns=[args.target]).to_numpy()
y_train = training[args.target].to_numpy()
return X_train, y_train

def model_with_proba(model, cli_args):
"""Return a model with the predict_proba method. A wrapper to catch models that do not implement
this method by default.
Args:
        model: An sklearn estimator object
        cli_args: Argparse object with command line arguments, used to reload the training data for calibration
Returns:
model: An sklearn estimator object with the predict_proba() method
"""
# Models without probabilities to check
known = ['LinearSVC']
# Return models with predict_proba
if hasattr(model, 'predict_proba'):
return model
# Wrap model with calibrator for probability prediction if it is in the known list
    # Do not refit the base model; fit only the probability calibrator on the training data.
elif model.__class__.__name__ in known:
# Wrap input model with calibrator
calib_model = CalibratedClassifierCV(base_estimator=model, cv="prefit")
# Recalibrate on training data
X_train, y_train = get_training_data(cli_args)
calib_model.fit(X_train, y_train)
return calib_model
else:
        raise ValueError('Model is not in the known list and does not implement predict_proba()')

def main():
parser = argparse.ArgumentParser()
parser.add_argument('--test')
parser.add_argument('--target')
parser.add_argument('--training')
args = parser.parse_args()

log.info('BEGIN')
# Read test data and split
log.info('Reading test data')
df_test = pd.read_csv(args.test, index_col=0, dtype=np.float64)
X_test = df_test.drop(columns=[args.target]).to_numpy()
y_test = df_test[args.target].to_numpy()

# Load model
log.info('Loading model')
loaded_model = joblib.load('model.joblib')
# Ensure predict_proba method is implemented
model = model_with_proba(loaded_model, args)

# Predict on unseen data
log.info('Predicting')
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)
target_1_prob = y_prob[:,1]

# Calculate metrics
log.info('Calculating metrics')
scores = [
('acc', accuracy_score),
('roc', roc_auc_score),
('prec', precision_score),
('recall', recall_score),
('f1', f1_score),
('mcc', matthews_corrcoef)
]

    # Apply each metric to the test predictions
results = []
for score in scores:
results.append((score[0], score[1](y_test, y_pred)))
    df_results = pd.DataFrame(results, columns=['score', 'result'])

# Get graph data for ROC and prec-rec
log.info('Calculating ROC and prec-rec curve data')
roc_data = roc_curve(y_test, target_1_prob)
prec_data = precision_recall_curve(y_test, target_1_prob)
df_roc = pd.DataFrame(roc_data, index=['fpr','tpr','thresholds']).T
df_prec = pd.DataFrame(prec_data, index=['prec', 'rec', 'thresholds']).T

# Write metrics to files
    df_results.to_csv('metrics.csv')
    df_roc.to_csv('roc_data.csv')
    df_prec.to_csv('precrec_data.csv')
log.info('END')

if __name__ == '__main__':
main()
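
For context, a hedged sketch of the wrapping that model_with_proba performs for an estimator in the known list, shown on random toy data with a prefit LinearSVC (note that newer scikit-learn releases rename base_estimator to estimator):

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# Random toy data, for illustration only.
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)

svc = LinearSVC().fit(X, y)
print(hasattr(svc, 'predict_proba'))  # False: LinearSVC exposes decision_function only

# cv="prefit" calibrates probabilities on the supplied data without refitting svc.
calibrated = CalibratedClassifierCV(base_estimator=svc, cv="prefit").fit(X, y)
probs = calibrated.predict_proba(X)   # shape (100, 2); column 1 is P(class == 1)
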
7 changes: 7 additions & 0 deletions src/log.py
@@ -0,0 +1,7 @@
import sys
from logbook import Logger, NestedSetup, StreamHandler, FileHandler

format_string = '[{record.time:%y%m%d %H:%M}] {record.level_name}: snakepot {record.channel}: {record.message}'

NestedSetup([FileHandler('logfile.log', format_string=format_string, level='DEBUG'),
StreamHandler(sys.stderr, format_string=format_string, bubble=True)]).push_application()
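
Importing this module pushes the handler setup once for the process; the other scripts then open a named channel, for example:

from src.log import Logger

log = Logger('example')  # 'example' becomes {record.channel} in the format string
log.info('written to logfile.log and echoed to stderr')
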
39 changes: 39 additions & 0 deletions src/predict.py
@@ -0,0 +1,39 @@
"""predict.py - Predict scores for unlabelled data"""

import sys
import argparse
import pandas as pd
import numpy as np
import joblib
from src.evaluate import model_with_proba

def main():
parser = argparse.ArgumentParser()
parser.add_argument('--unlabelled')
parser.add_argument('--training')
parser.add_argument('--target')
args = parser.parse_args()

# Load data
unlabelled = pd.read_csv(args.unlabelled, index_col=0, dtype=np.float64)

# Load model
loaded_model = joblib.load('model.joblib')
# Ensure predict_proba method is implemented
model = model_with_proba(loaded_model, args)

# Predict
y_pred = model.predict(unlabelled)
y_prob = model.predict_proba(unlabelled)[:,1] # Gets the probabilities for '1' class predictions

    # Build a dataframe of scores with indexes from 'unlabelled'
pred_tuple = zip(y_pred, y_prob)
pred_columns = ['prediction', 'probability']
pred_index = unlabelled.index
pred_dataframe = pd.DataFrame(pred_tuple, columns=pred_columns, index=pred_index)

# Write scores out
    pred_dataframe.to_csv('unlabelled_predictions.csv')

if __name__ == '__main__':
main()
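
A minimal sketch of the prediction-frame construction above, with a toy LogisticRegression and random features standing in for model.joblib and the real unlabelled data (all names and values are illustrative):

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Stand-ins for the persisted model and the unlabelled CSV.
rng = np.random.RandomState(0)
model = LogisticRegression().fit(rng.rand(20, 3), rng.randint(0, 2, 20))
unlabelled = pd.DataFrame(rng.rand(5, 3), index=['a', 'b', 'c', 'd', 'e'])

y_pred = model.predict(unlabelled)
y_prob = model.predict_proba(unlabelled)[:, 1]           # P(class == 1)
scores = pd.DataFrame(zip(y_pred, y_prob),
                      columns=['prediction', 'probability'],
                      index=unlabelled.index)            # keeps the original row ids
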
89 changes: 89 additions & 0 deletions src/tpot_caller.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""tpot.py
Run tpot on an input training dataset."""

import sys
import os
import importlib.util
import joblib
import tempfile
from src.log import Logger
log = Logger('tpot')

import argparse
import pandas as pd
import numpy as np
from tpot import TPOTClassifier


class TPOTCleaner():
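    """Trim a TPOT-exported pipeline script down to its import block and the
    'exported_pipeline = ...' definition, dropping the data-loading boilerplate
    so the pipeline can be re-imported and refitted."""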
def __init__(self, tpot_file):
with open(tpot_file, 'r') as f:
self.lines = f.readlines()

@property
def import_lines(self):
lines = self.lines
import_break = lines.index('\n')
import_lines = lines[:import_break]
return import_lines

@property
def export_lines(self):
lines = self.lines
export_start_line = list(filter(lambda x: 'exported_pipeline = ' in x, lines))[0]
export_list = lines[lines.index(export_start_line):]
export_break = export_list.index('\n')
export_lines = export_list[:export_break]
return export_lines

def write_out(self, outdir):
with open(outdir, 'w') as f:
f.write("".join(self.import_lines))
f.write("".join(self.export_lines))

def main():
parser = argparse.ArgumentParser()
parser.add_argument('--training')
parser.add_argument('--target')
parser.add_argument('--outdir')
parser.add_argument('--max_time', type=int)
args = parser.parse_args()

log.info('BEGIN')
log.info('Loading data')
training = pd.read_csv(args.training, index_col=0, dtype=np.float64)
X_train = training.drop(columns=[args.target]).to_numpy()
y_train = training[args.target].to_numpy()

# TPOT setup
pipeline_optimizer = TPOTClassifier(max_time_mins=args.max_time, cv=10, n_jobs=-1,
random_state=42, verbosity=2, memory='auto')

# TPOT run
log.info('Running TPOT')
pipeline_optimizer.fit(X_train, y_train)
pipeline_optimizer.export(f'{args.outdir}/tpot_pipeline.py')

# Create python file for refitting model
log.info('Cleaning TPOT output file')
    # Read the variable 'exported_pipeline' from the TPOT output
tc = TPOTCleaner(f'{args.outdir}/tpot_pipeline.py')
tc.write_out(f'{args.outdir}/tpot_pipe.py')

# Refit model on training data and save
log.info('Refitting model')
spec = importlib.util.spec_from_file_location("src", f"{args.outdir}/tpot_pipe.py")
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

model = mod.exported_pipeline
model.fit(X_train, y_train)

log.info('Saving model')
joblib.dump(model, f'{args.outdir}/model.joblib')

log.info('END')

if __name__=="__main__":
main()
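
A hedged sketch of what TPOTCleaner keeps, run against a small, hypothetical TPOT export; the file content below is typical of TPOT's output but is not produced by this pipeline, and the example assumes src.tpot_caller (and its dependencies) are importable:

from src.tpot_caller import TPOTCleaner

# Hypothetical exported script: imports, data-loading boilerplate, and the
# pipeline definition. TPOTCleaner keeps only the first and last of these.
example_export = """import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

tpot_data = pd.read_csv('PATH/TO/DATA/FILE', dtype=np.float64)
features = tpot_data.drop('target', axis=1)

exported_pipeline = RandomForestClassifier(n_estimators=100, random_state=42)

exported_pipeline.fit(features, tpot_data['target'])
"""

with open('tpot_pipeline_example.py', 'w') as f:
    f.write(example_export)

tc = TPOTCleaner('tpot_pipeline_example.py')
print(''.join(tc.import_lines))       # the three import lines
print(''.join(tc.export_lines))       # just the exported_pipeline assignment
tc.write_out('tpot_pipe_example.py')  # imports + pipeline definition only
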