From e58640f21eb5ee6383c201e1760171f3e7f804c3 Mon Sep 17 00:00:00 2001
From: Nana Mensah
Date: Fri, 6 Sep 2019 08:25:35 +0100
Subject: [PATCH] Add src code

---
 src/__init__.py       |   0
 src/clean.py          |  89 ++++++++++++++++++++++++++++++++++
 src/evaluate.py       | 115 +++++++++++++++++++++++++++++++++++++++++++++
 src/log.py            |   7 +++
 src/predict.py        |  41 +++++++++++++++
 src/tpot_caller.py    |  91 ++++++++++++++++++++++++++++++++++++
 src/train_val_pred.py | 107 ++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 450 insertions(+)
 create mode 100644 src/__init__.py
 create mode 100644 src/clean.py
 create mode 100644 src/evaluate.py
 create mode 100644 src/log.py
 create mode 100644 src/predict.py
 create mode 100644 src/tpot_caller.py
 create mode 100644 src/train_val_pred.py

diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/clean.py b/src/clean.py
new file mode 100644
index 0000000..22024e4
--- /dev/null
+++ b/src/clean.py
@@ -0,0 +1,89 @@
+"""clean.py
+
+Drop and encode features specified in config.json.
+"""
+
+import os
+import sys
+
+import argparse
+import category_encoders as ce
+import pandas as pd
+
+from src.log import Logger
+log = Logger('clean')
+
+class Cleaner():
+    """Drop and encode columns in a dataframe. Additionally, drops rows with NA values.
+
+    Args:
+        data (pd.DataFrame): Dataframe containing data
+        to_drop (List): Column names to drop
+        to_encode (List): Column names to one-hot encode
+    Methods:
+        process: Applies each cleaning method to the dataframe in turn
+        encode: One-hot encodes features
+        drop_columns: Remove features passed at class initialisation
+        drop_na_rows: Remove any rows with empty cells
+    """
+    def __init__(self, data, to_drop, to_encode):
+        self.data = data
+        self.to_drop = to_drop
+        self.to_encode = to_encode
+        # Set list of methods to apply in process()
+        self.operations = [self.encode, self.drop_columns, self.drop_na_rows]
+
+    def process(self):
+        df_process = self.data.copy(deep=True)
+        for operation in self.operations:
+            df_process = operation(df_process)
+        return df_process
+
+    def encode(self, df):
+        encodable = set(self.to_encode).intersection(set(df.columns))
+        if not encodable:  # No columns to encode
+            return df
+        else:
+            encoder = ce.OneHotEncoder(cols=list(encodable), use_cat_names=True, handle_unknown='ignore', return_df=True)
+            encoded = encoder.fit_transform(df)
+            return encoded
+
+    def drop_columns(self, df):
+        return df.drop(columns=self.to_drop, errors='ignore')
+
+    def drop_na_rows(self, df):
+        return df.dropna(axis='index', how='any')
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input')
+    parser.add_argument('--drop', nargs='+')
+    parser.add_argument('--encode', nargs='+')
+    args = parser.parse_args()
+
+    # Read data
+    log.info('BEGIN')
+    indf = pd.read_csv(args.input, index_col=0)
+    log.info(f'Input shape {indf.shape}')
+
+    # Clean data
+    cleaner = Cleaner(data=indf, to_drop=args.drop, to_encode=args.encode)
+    cleaned = cleaner.process()
+
+    # Assert only one column contains strings. This will be the target,
+    # used later to split the data.
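+    # For example (hypothetical data): a frame with columns ['age', 'bmi',
+    # 'outcome'] passes only when 'outcome' is the sole string-typed column.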
+    string_cols = cleaned.select_dtypes('object').columns
+    log.debug(string_cols)
+    assert len(string_cols) == 1
+
+    # Write data
+    output = os.path.join('cleaned.csv')
+    cleaned.to_csv(output)
+    log.info(f'Output shape {cleaned.shape}')
+    log.info('END')
+
+if __name__ == "__main__":
+    main()
diff --git a/src/evaluate.py b/src/evaluate.py
new file mode 100644
index 0000000..45fad8a
--- /dev/null
+++ b/src/evaluate.py
@@ -0,0 +1,115 @@
+"""evaluate.py - Evaluate model against test data"""
+
+import sys
+import os
+from src.log import Logger
+log = Logger('evaluate')
+
+import argparse
+import pandas as pd
+import numpy as np
+import joblib
+
+from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
+from sklearn.metrics import roc_curve, precision_recall_curve
+from sklearn.calibration import CalibratedClassifierCV
+
+def get_training_data(cli_args):
+    """Load the training dataset.
+    Args:
+        cli_args: Argparse object with command line arguments
+    Returns:
+        X_train, y_train: Feature matrix and target vector as numpy arrays
+    """
+    args = cli_args
+    training = pd.read_csv(args.training, index_col=0, dtype=np.float64)
+    X_train = training.drop(columns=[args.target]).to_numpy()
+    y_train = training[args.target].to_numpy()
+    return X_train, y_train
+
+def model_with_proba(model, cli_args):
+    """Return a model with the predict_proba method. A wrapper to catch models that do not implement
+    this method by default.
+    Args:
+        model: An sklearn estimator object
+        cli_args: Argparse object with command line arguments
+    Returns:
+        model: An sklearn estimator object with the predict_proba() method
+    """
+    # Models without probabilities to check
+    known = ['LinearSVC']
+    # Return models with predict_proba
+    if hasattr(model, 'predict_proba'):
+        return model
+    # Wrap model with a calibrator for probability prediction if it is in the known list.
+    # The model itself is not refit; only the calibrator is fitted.
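+    # Note: with cv="prefit", CalibratedClassifierCV.fit() leaves the wrapped
+    # estimator's parameters untouched and fits only the probability
+    # calibrator (sigmoid scaling by default) on the data passed to fit().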
+    elif model.__class__.__name__ in known:
+        # Wrap input model with calibrator
+        calib_model = CalibratedClassifierCV(base_estimator=model, cv="prefit")
+        # Calibrate on the training data
+        X_train, y_train = get_training_data(cli_args)
+        calib_model.fit(X_train, y_train)
+        return calib_model
+    else:
+        raise ValueError('Model is not in known list and does not have predict_proba() method')
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--test')
+    parser.add_argument('--target')
+    parser.add_argument('--training')
+    args = parser.parse_args()
+
+    log.info('BEGIN')
+    # Read test data and split
+    log.info('Reading test data')
+    df_test = pd.read_csv(args.test, index_col=0, dtype=np.float64)
+    X_test = df_test.drop(columns=[args.target]).to_numpy()
+    y_test = df_test[args.target].to_numpy()
+
+    # Load model
+    log.info('Loading model')
+    loaded_model = joblib.load('model.joblib')
+    # Ensure predict_proba method is implemented
+    model = model_with_proba(loaded_model, args)
+
+    # Predict on unseen data
+    log.info('Predicting')
+    y_pred = model.predict(X_test)
+    y_prob = model.predict_proba(X_test)
+    target_1_prob = y_prob[:, 1]
+
+    # Calculate metrics
+    log.info('Calculating metrics')
+    scores = [
+        ('acc', accuracy_score),
+        ('roc', roc_auc_score),
+        ('prec', precision_score),
+        ('recall', recall_score),
+        ('f1', f1_score),
+        ('mcc', matthews_corrcoef)
+    ]
+
+    results = []
+    for name, scorer in scores:
+        results.append((name, scorer(y_test, y_pred)))
+    df_results = pd.DataFrame(results, columns=['score', 'result'])
+
+    # Get graph data for ROC and prec-rec
+    log.info('Calculating ROC and prec-rec curve data')
+    roc_data = roc_curve(y_test, target_1_prob)
+    prec_data = precision_recall_curve(y_test, target_1_prob)
+    df_roc = pd.DataFrame(roc_data, index=['fpr', 'tpr', 'thresholds']).T
+    df_prec = pd.DataFrame(prec_data, index=['prec', 'rec', 'thresholds']).T
+
+    # Write metrics to files
+    df_results.to_csv('metrics.csv')
+    df_roc.to_csv('roc_data.csv')
+    df_prec.to_csv('precrec_data.csv')
+    log.info('END')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/src/log.py b/src/log.py
new file mode 100644
index 0000000..ca2cdde
--- /dev/null
+++ b/src/log.py
@@ -0,0 +1,7 @@
+import sys
+from logbook import Logger, NestedSetup, StreamHandler, FileHandler
+
+format_string = '[{record.time:%y%m%d %H:%M}] {record.level_name}: snakepot {record.channel}: {record.message}'
+
+NestedSetup([FileHandler('logfile.log', format_string=format_string, level='DEBUG'),
+             StreamHandler(sys.stderr, format_string=format_string, bubble=True)]).push_application()
diff --git a/src/predict.py b/src/predict.py
new file mode 100644
index 0000000..1f9bf89
--- /dev/null
+++ b/src/predict.py
@@ -0,0 +1,41 @@
+"""predict.py - Predict scores for unlabelled data"""
+
+import sys
+import argparse
+import pandas as pd
+import numpy as np
+import joblib
+from src.evaluate import get_training_data, model_with_proba
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--unlabelled')
+    parser.add_argument('--training')
+    parser.add_argument('--target')
+    args = parser.parse_args()
+
+    # Load data
+    unlabelled = pd.read_csv(args.unlabelled, index_col=0, dtype=np.float64)
+
+    # Load model
+    loaded_model = joblib.load('model.joblib')
+    # Ensure predict_proba method is implemented
+    model = model_with_proba(loaded_model, args)
+
+    # Predict
+    y_pred = model.predict(unlabelled)
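+    # predict_proba returns one column per class, ordered as in model.classes_;
+    # column index 1 is assumed here to be the positive ('1') class.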
+    y_prob = model.predict_proba(unlabelled)[:, 1]  # Probabilities for the '1' class
+
+    # Build a dataframe of scores with indexes from 'unlabelled'
+    pred_tuple = zip(y_pred, y_prob)
+    pred_columns = ['prediction', 'probability']
+    pred_index = unlabelled.index
+    pred_dataframe = pd.DataFrame(pred_tuple, columns=pred_columns, index=pred_index)
+
+    # Write scores out
+    pred_dataframe.to_csv('unlabelled_predictions.csv')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/src/tpot_caller.py b/src/tpot_caller.py
new file mode 100644
index 0000000..a8d5d2a
--- /dev/null
+++ b/src/tpot_caller.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""tpot_caller.py
+Run TPOT on an input training dataset."""
+
+import sys
+import os
+import importlib.util
+import joblib
+import tempfile
+from src.log import Logger
+log = Logger('tpot')
+
+import argparse
+import pandas as pd
+import numpy as np
+from tpot import TPOTClassifier
+
+
+class TPOTCleaner():
+    """Extract the import statements and the exported pipeline definition
+    from a TPOT-generated script."""
+    def __init__(self, tpot_file):
+        with open(tpot_file, 'r') as f:
+            self.lines = f.readlines()
+
+    @property
+    def import_lines(self):
+        lines = self.lines
+        import_break = lines.index('\n')
+        import_lines = lines[:import_break]
+        return import_lines
+
+    @property
+    def export_lines(self):
+        lines = self.lines
+        export_start_line = list(filter(lambda x: 'exported_pipeline = ' in x, lines))[0]
+        export_list = lines[lines.index(export_start_line):]
+        export_break = export_list.index('\n')
+        export_lines = export_list[:export_break]
+        return export_lines
+
+    def write_out(self, outdir):
+        with open(outdir, 'w') as f:
+            f.write("".join(self.import_lines))
+            f.write("".join(self.export_lines))
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--training')
+    parser.add_argument('--target')
+    parser.add_argument('--outdir')
+    parser.add_argument('--max_time', type=int)
+    args = parser.parse_args()
+
+    log.info('BEGIN')
+    log.info('Loading data')
+    training = pd.read_csv(args.training, index_col=0, dtype=np.float64)
+    X_train = training.drop(columns=[args.target]).to_numpy()
+    y_train = training[args.target].to_numpy()
+
+    # TPOT setup
+    pipeline_optimizer = TPOTClassifier(max_time_mins=args.max_time, cv=10, n_jobs=-1,
+                                        random_state=42, verbosity=2, memory='auto')
+
+    # TPOT run
+    log.info('Running TPOT')
+    pipeline_optimizer.fit(X_train, y_train)
+    pipeline_optimizer.export(f'{args.outdir}/tpot_pipeline.py')
+
+    # Create python file for refitting model
+    log.info('Cleaning TPOT output file')
+    # Read variable 'exported_pipeline' from TPOT output
+    tc = TPOTCleaner(f'{args.outdir}/tpot_pipeline.py')
+    tc.write_out(f'{args.outdir}/tpot_pipe.py')
+
+    # Refit model on training data and save
+    log.info('Refitting model')
+    spec = importlib.util.spec_from_file_location("tpot_pipe", f"{args.outdir}/tpot_pipe.py")
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+
+    model = mod.exported_pipeline
+    model.fit(X_train, y_train)
+
+    log.info('Saving model')
+    joblib.dump(model, f'{args.outdir}/model.joblib')
+
+    log.info('END')
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/train_val_pred.py b/src/train_val_pred.py
new file mode 100644
index 0000000..1ce0e66
--- /dev/null
+++ b/src/train_val_pred.py
@@ -0,0 +1,107 @@
+"""train_val_pred.py - Split dataframe based on target variable"""
+
+import sys
+import os
+from src.log import Logger
+log = Logger('train_val_pred')
+
+import argparse
+import category_encoders as ce
+import pandas as pd
+
+class DataSplitter():
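+    """Split a cleaned dataframe into labelled test/train sets and an
+    unlabelled set for prediction.
+
+    The target column is expected to hold exactly three categories: a
+    positive label (target_1), a negative label, and a 'to_predict' value
+    marking unlabelled rows.
+
+    Args:
+        df (pd.DataFrame): Cleaned input data
+        target (str): Name of the target column
+        target_1 (str): Category to encode as the positive (1) class
+        to_predict (str): Category marking rows to predict
+        perc_split (float): Fraction of labelled data held out for testing
+    """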
+    def __init__(self, df, target, target_1, to_predict, perc_split):
+        self.df = df
+        self.target = target
+        assert df[target].nunique() == 3
+        self.target_1 = target_1
+        self.to_predict = to_predict
+        self.perc_split = perc_split
+        self.encoder = ce.OneHotEncoder(cols=[target], use_cat_names=True, return_df=True, drop_invariant=True)
+        self.encoded = self.encoder.fit_transform(self.df)
+
+    def get_test_train(self):
+        df = self._get_labelled()
+        # Get dataframes for each of the binary outputs
+        df_targ1 = df[df[self.target] == 1]
+        df_targ0 = df[df[self.target] == 0]
+        # Split each dataframe by the input fraction. The test dataset receives the split fraction
+        df_targ1_test, df_targ1_train = self._perc_splitter(df_targ1, self.perc_split)
+        df_targ0_test, df_targ0_train = self._perc_splitter(df_targ0, self.perc_split)
+        # Combine training and test datasets
+        test = pd.concat([df_targ1_test, df_targ0_test])
+        training = pd.concat([df_targ1_train, df_targ0_train])
+        return (test, training)
+
+    def _get_labelled(self):
+        col_to_predict = f'{self.target}_{self.to_predict}'
+        col_to_train = f'{self.target}_{self.target_1}'
+        encoded_target_1 = self.encoded[col_to_train]
+        to_train_bool = (self.encoded[col_to_predict] == 0)
+        df_to_train = self.df[to_train_bool].drop(columns=[self.target])
+        df_to_train[self.target] = encoded_target_1[to_train_bool]
+        return df_to_train.sample(frac=1, random_state=42)
+
+    def get_to_predict(self):
+        col_to_predict = f'{self.target}_{self.to_predict}'
+        to_predict_bool = (self.encoded[col_to_predict] == 1)
+        df_to_predict = self.df[to_predict_bool].drop(columns=[self.target])
+        return df_to_predict.sample(frac=1, random_state=42)
+
+    def _perc_splitter(self, df, perc):
+        """Splits a dataframe (df) into two by some fraction (perc).
+        Returns:
+            split_data (tuple): split_by_perc, data_remainder"""
+        # Split the dataframe by percentage
+        split_by_perc = df.sample(frac=perc)
+        # Get the remainder dataframe using the split data index
+        data_remainder = df.drop(index=split_by_perc.index)
+        # Return result
+        return split_by_perc, data_remainder
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    for argument in ['--cleaned', '--target_column', '--target_1', '--target_0', '--to_predict']:
+        parser.add_argument(argument)
+    parser.add_argument('--perc_split', type=float)
+    args = parser.parse_args()
+
+    log.info('BEGIN')
+
+    # Read data
+    cleaned = pd.read_csv(args.cleaned, index_col=0)
+
+    # Encode target
+    log.info('Encoding data and asserting 3 unique values in target column')
+    ds = DataSplitter(cleaned, args.target_column, args.target_1, args.to_predict, args.perc_split)
+
+    # Split and write validation data
+    log.info('Getting unlabelled dataset')
+    unlabelled = ds.get_to_predict()
+    unlabelled.to_csv('unlabelled.csv')
+
+    # Get test and train data. Write to output files.
+    log.info('Getting training and test datasets')
+    test, training = ds.get_test_train()
+    training.to_csv('training.csv')
+    test.to_csv('test.csv')
+
+    log.info('END')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file