Nana Mensah authored and committed on Sep 6, 2019
1 parent 106d9fa · commit e58640f
Showing 7 changed files with 426 additions and 0 deletions.
Empty file.
@@ -0,0 +1,87 @@
"""clean.py
Drop and encode features specified in config.json.
"""

import argparse
import category_encoders as ce
import pandas as pd

from src.log import Logger
log = Logger('clean')


class Cleaner:
    """Drop and encode columns in a dataframe. Additionally, drops rows with NA values.
    Args:
        data (pd.DataFrame): Dataframe containing data
        to_drop (List): Column names to drop
        to_encode (List): Column names to one-hot encode
    Methods:
        process: Applies each cleaning operation to a copy of the dataframe, in order
        encode: One-hot encodes features
        drop_columns: Remove features passed at class initialisation
        drop_na_rows: Remove any rows with empty cells
    """
    def __init__(self, data, to_drop, to_encode):
        self.data = data
        self.to_drop = to_drop
        self.to_encode = to_encode
        # Set the list of cleaning methods to apply, in order
        self.operations = [self.encode, self.drop_columns, self.drop_na_rows]

    def process(self):
        df_process = self.data.copy(deep=True)
        for operation in self.operations:
            df_process = operation(df_process)
        return df_process

    def encode(self, df):
        encodable = set(self.to_encode).intersection(set(df.columns))
        if not encodable:  # No columns to encode
            return df
        else:
            encoder = ce.OneHotEncoder(cols=list(encodable), use_cat_names=True,
                                       handle_unknown='ignore', return_df=True)
            encoded = encoder.fit_transform(df)
            return encoded

    def drop_columns(self, df):
        return df.drop(columns=self.to_drop, errors='ignore')

    def drop_na_rows(self, df):
        return df.dropna(axis='index', how='any')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input')
    parser.add_argument('--drop', nargs='+')
    parser.add_argument('--encode', nargs='+')
    args = parser.parse_args()

    # Read data
    log.info('BEGIN')
    indf = pd.read_csv(args.input, index_col=0)
    log.info(f'Input shape {indf.shape}')

    # Clean data
    cleaner = Cleaner(data=indf, to_drop=args.drop, to_encode=args.encode)
    cleaned = cleaner.process()

    # Assert only one column contains strings. This will be the target,
    # used later to split the data.
    string_cols = cleaned.select_dtypes('object').columns
    log.debug(string_cols)
    assert len(string_cols) == 1

    # Write data
    output = 'cleaned.csv'
    cleaned.to_csv(output)
    log.info(f'Output shape {cleaned.shape}')
    log.info('END')


if __name__ == "__main__":
    main()
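A minimal sketch of how Cleaner behaves on a toy frame (the column names below are hypothetical, not from the repo, and the import assumes clean.py is on the path):

import pandas as pd
from clean import Cleaner  # hypothetical import path for the file above

df = pd.DataFrame({
    'id': [1, 2, 3],
    'colour': ['red', 'blue', 'red'],
    'value': [0.1, None, 0.3],
})
cleaned = Cleaner(data=df, to_drop=['id'], to_encode=['colour']).process()
# 'colour' expands to one-hot columns (colour_red, colour_blue) because
# use_cat_names=True, 'id' is dropped, and the middle row is removed
# because its 'value' cell is NA.
print(cleaned)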
@@ -0,0 +1,110 @@
"""evaluate.py - Evaluate model against test data"""

from src.log import Logger
log = Logger('evaluate')

import argparse
import pandas as pd
import numpy as np
import joblib

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.calibration import CalibratedClassifierCV


def get_training_data(cli_args):
    """Load the training dataset.
    Args:
        cli_args: Argparse object with command line arguments
    """
    args = cli_args
    training = pd.read_csv(args.training, index_col=0, dtype=np.float64)
    X_train = training.drop(columns=[args.target]).to_numpy()
    y_train = training[args.target].to_numpy()
    return X_train, y_train


def model_with_proba(model, cli_args):
    """Return a model with the predict_proba method. A wrapper to catch models
    that do not implement this method by default.
    Args:
        model: An sklearn estimator object
        cli_args: Argparse object with command line arguments (used to reload
            the training data if calibration is needed)
    Returns:
        model: An sklearn estimator object with the predict_proba() method
    """
    # Models without probabilities to check
    known = ['LinearSVC']
    # Return models that already implement predict_proba
    if hasattr(model, 'predict_proba'):
        return model
    # Wrap models in the known list with a calibrator for probability prediction.
    # Do not refit the model, simply calibrate it on the training data.
    elif model.__class__.__name__ in known:
        # Wrap input model with calibrator
        calib_model = CalibratedClassifierCV(base_estimator=model, cv="prefit")
        # Recalibrate on training data
        X_train, y_train = get_training_data(cli_args)
        calib_model.fit(X_train, y_train)
        return calib_model
    else:
        raise ValueError('Model is not in known list and does not have predict_proba() method')
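# Note: cv="prefit" calibrates an already-fitted estimator without retraining it,
# and base_estimator matches the sklearn API this commit targets (later releases
# renamed it to estimator). The same pattern in isolation, on synthetic data
# (illustrative only):
#
#     from sklearn.datasets import make_classification
#     from sklearn.svm import LinearSVC
#     X, y = make_classification(n_samples=200, random_state=0)
#     svc = LinearSVC().fit(X, y)
#     calibrated = CalibratedClassifierCV(base_estimator=svc, cv="prefit").fit(X, y)
#     probs = calibrated.predict_proba(X)  # shape (200, 2)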
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--test')
    parser.add_argument('--target')
    parser.add_argument('--training')
    args = parser.parse_args()

    log.info('BEGIN')
    # Read test data and split
    log.info('Reading test data')
    df_test = pd.read_csv(args.test, index_col=0, dtype=np.float64)
    X_test = df_test.drop(columns=[args.target]).to_numpy()
    y_test = df_test[args.target].to_numpy()

    # Load model
    log.info('Loading model')
    loaded_model = joblib.load('model.joblib')
    # Ensure the predict_proba method is implemented
    model = model_with_proba(loaded_model, args)

    # Predict on unseen data
    log.info('Predicting')
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    target_1_prob = y_prob[:, 1]

    # Metrics to calculate. Note that 'roc' is scored on the hard
    # predictions here, not on target_1_prob.
    log.info('Calculating metrics')
    scores = [
        ('acc', accuracy_score),
        ('roc', roc_auc_score),
        ('prec', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
        ('mcc', matthews_corrcoef)
    ]

    # Calculate each metric against the test labels
    results = []
    for name, score_fn in scores:
        results.append((name, score_fn(y_test, y_pred)))
    df_results = pd.DataFrame(results, columns=['score', 'result'])

    # Get graph data for ROC and prec-rec curves
    log.info('Calculating ROC and prec-rec curve data')
    roc_data = roc_curve(y_test, target_1_prob)
    prec_data = precision_recall_curve(y_test, target_1_prob)
    df_roc = pd.DataFrame(roc_data, index=['fpr', 'tpr', 'thresholds']).T
    df_prec = pd.DataFrame(prec_data, index=['prec', 'rec', 'thresholds']).T

    # Write metrics to files
    df_results.to_csv('metrics.csv')
    df_roc.to_csv('roc_data.csv')
    df_prec.to_csv('precrec_data.csv')
    log.info('END')


if __name__ == '__main__':
    main()
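One quirk worth knowing when reading precrec_data.csv: precision_recall_curve returns precision and recall arrays one element longer than thresholds, so the transposed DataFrame pads the final thresholds cell with NaN. A toy illustration (labels and scores made up):

import pandas as pd
from sklearn.metrics import precision_recall_curve

prec, rec, thr = precision_recall_curve([0, 1, 1], [0.2, 0.6, 0.9])
df = pd.DataFrame((prec, rec, thr), index=['prec', 'rec', 'thresholds']).T
print(df)  # the last row's 'thresholds' entry is NaN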
@@ -0,0 +1,7 @@
import sys
# Logger is re-exported here: the other modules do `from src.log import Logger`
from logbook import Logger, NestedSetup, StreamHandler, FileHandler

format_string = '[{record.time:%y%m%d %H:%M}] {record.level_name}: snakepot {record.channel}: {record.message}'

# Send DEBUG and above to logfile.log, and mirror records to stderr
NestedSetup([FileHandler('logfile.log', format_string=format_string, level='DEBUG'),
             StreamHandler(sys.stderr, format_string=format_string, bubble=True)]).push_application()
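Importing this module once per process configures logbook globally; each script then opens its own named channel. A sketch of the resulting usage (channel name and messages are illustrative):

from src.log import Logger

log = Logger('example')  # the channel name appears in every record
log.info('BEGIN')        # -> [190906 12:00] INFO: snakepot example: BEGIN
log.debug('written to logfile.log and mirrored to stderr')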
@@ -0,0 +1,39 @@
"""predict.py - Predict scores for unlabelled data"""

import argparse
import pandas as pd
import numpy as np
import joblib
from evaluate import model_with_proba


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--unlabelled')
    parser.add_argument('--training')
    parser.add_argument('--target')
    args = parser.parse_args()

    # Load data
    unlabelled = pd.read_csv(args.unlabelled, index_col=0, dtype=np.float64)

    # Load model
    loaded_model = joblib.load('model.joblib')
    # Ensure the predict_proba method is implemented
    model = model_with_proba(loaded_model, args)

    # Predict
    y_pred = model.predict(unlabelled)
    y_prob = model.predict_proba(unlabelled)[:, 1]  # Probabilities for the '1' class

    # Build a dataframe of scores with indexes from 'unlabelled'
    pred_tuple = zip(y_pred, y_prob)
    pred_columns = ['prediction', 'probability']
    pred_index = unlabelled.index
    pred_dataframe = pd.DataFrame(pred_tuple, columns=pred_columns, index=pred_index)

    # Write scores out
    pred_dataframe.to_csv('unlabelled_predictions.csv')


if __name__ == '__main__':
    main()
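The zip-to-DataFrame step pairs each hard prediction with its class-1 probability while preserving the input row index. The same step in isolation (toy values, hypothetical index labels):

import pandas as pd

y_pred = [1, 0, 1]
y_prob = [0.91, 0.12, 0.67]
index = pd.Index(['sample_a', 'sample_b', 'sample_c'])
df = pd.DataFrame(zip(y_pred, y_prob),
                  columns=['prediction', 'probability'], index=index)
# Each row lines up a predicted label with its probability of class 1.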
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""tpot.py
Run tpot on an input training dataset."""

import importlib.util
import joblib
from src.log import Logger
log = Logger('tpot')

import argparse
import pandas as pd
import numpy as np
from tpot import TPOTClassifier


class TPOTCleaner:
    """Trim a TPOT-exported pipeline script down to its import block and the
    'exported_pipeline = ...' assignment, so it can be imported as a module."""
    def __init__(self, tpot_file):
        with open(tpot_file, 'r') as f:
            self.lines = f.readlines()

    @property
    def import_lines(self):
        # Everything up to the first blank line is the import block
        lines = self.lines
        import_break = lines.index('\n')
        import_lines = lines[:import_break]
        return import_lines

    @property
    def export_lines(self):
        # Slice from the 'exported_pipeline = ' assignment to the next blank line
        lines = self.lines
        export_start_line = list(filter(lambda x: 'exported_pipeline = ' in x, lines))[0]
        export_list = lines[lines.index(export_start_line):]
        export_break = export_list.index('\n')
        export_lines = export_list[:export_break]
        return export_lines

    def write_out(self, outfile):
        with open(outfile, 'w') as f:
            f.write("".join(self.import_lines))
            f.write("".join(self.export_lines))
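# The slicing above assumes the usual shape of a TPOT export: an import block
# terminated by the first blank line, and further down an assignment of the form
#     exported_pipeline = <estimator or pipeline>(...)
# terminated by the next blank line. Everything else in the export (the CSV
# loading and the fit/predict boilerplate) is discarded by write_out().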
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--training')
    parser.add_argument('--target')
    parser.add_argument('--outdir')
    parser.add_argument('--max_time', type=int)
    args = parser.parse_args()

    log.info('BEGIN')
    log.info('Loading data')
    training = pd.read_csv(args.training, index_col=0, dtype=np.float64)
    X_train = training.drop(columns=[args.target]).to_numpy()
    y_train = training[args.target].to_numpy()

    # TPOT setup
    pipeline_optimizer = TPOTClassifier(max_time_mins=args.max_time, cv=10, n_jobs=-1,
                                        random_state=42, verbosity=2, memory='auto')

    # TPOT run
    log.info('Running TPOT')
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export(f'{args.outdir}/tpot_pipeline.py')

    # Create a python file for refitting the model
    log.info('Cleaning TPOT output file')
    # Read the variable 'exported_pipeline' from the TPOT output
    tc = TPOTCleaner(f'{args.outdir}/tpot_pipeline.py')
    tc.write_out(f'{args.outdir}/tpot_pipe.py')

    # Refit model on training data and save
    log.info('Refitting model')
    spec = importlib.util.spec_from_file_location("src", f"{args.outdir}/tpot_pipe.py")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    model = mod.exported_pipeline
    model.fit(X_train, y_train)

    log.info('Saving model')
    joblib.dump(model, f'{args.outdir}/model.joblib')

    log.info('END')


if __name__ == "__main__":
    main()
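To see the trimming in isolation, a hedged sketch against a hand-written stand-in for a TPOT export (the file content is illustrative, not a real TPOT run; tpot.py shadows the tpot package, so the import name here is hypothetical):

from tpot_script import TPOTCleaner  # hypothetical module name for the file above

fake_export = (
    "import numpy as np\n"
    "from sklearn.linear_model import LogisticRegression\n"
    "\n"
    "tpot_data = ...  # CSV-loading boilerplate, discarded\n"
    "\n"
    "exported_pipeline = LogisticRegression(C=10.0)\n"
    "\n"
    "exported_pipeline.fit(training_features, training_target)\n"
)
with open('fake_pipeline.py', 'w') as f:
    f.write(fake_export)

TPOTCleaner('fake_pipeline.py').write_out('fake_pipe.py')
# fake_pipe.py now holds only the two imports and the
# 'exported_pipeline = LogisticRegression(C=10.0)' assignment.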