Skip to content

Commit

Permalink
Create snakefile and config
Browse files Browse the repository at this point in the history
  • Loading branch information
Nana Mensah authored and Nana Mensah committed Sep 6, 2019
1 parent e58640f commit c8cf6e2
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 0 deletions.
103 changes: 103 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""Snakefile
A snakemake workflow (snakepot) for calling the TPOT binary classifier:
rule copy: Copy the input dataset to the working directory
rule clean: Drop columns and rows with empty data
rule split: Create train, test, and validation data subsets
rule tpot: Run the TPOTClassifier with training and test data
rule evaluate: Evaluate model on validation data
rule predict: Predict unlabelled values in the input dataset
rule end: Copy config.json to working directory to complete snakepot
"""

def expand_config(inlist):
    """Shell-quote a list of config values for safe shell interpolation.

    Args:
        inlist: A list of values to be converted to strings and quoted.

    Returns:
        A list of strings, each quoted only when the shell requires it.
        Uses shlex.quote instead of unconditionally wrapping in single
        quotes, so values that themselves contain single quotes (or other
        shell metacharacters) can no longer break the generated command.
    """
    import shlex  # local import: this file has no top-level import block
    return [shlex.quote(str(i)) for i in inlist]

# Set config file (resolved relative to the Snakefile's directory)
configfile: "config.json"
# Create and set working directory; all rule inputs/outputs below are
# relative to this directory, hence the "../" prefixes on project paths.
workdir: config['directory']
# NOTE(review): removed the invalid `snakedir:` line that followed —
# Snakemake has no `snakedir` directive, so it raises a workflow parse
# error. The Snakefile's own location is available via workflow.basedir
# if it is ever needed.

# Set build target
rule all:
input: "config.json"

# STEP 0: Copy the input dataset into the working directory
rule copy:
    # config['input'] is relative to the project root, one level above
    # the working directory — hence the leading "../".
    input: f"../{config['input']}"
    output: "data.csv"
    shell: "cp {input} data.csv"

# STEP 1: Clean the dataset.
rule clean:
    # Read the dataset
    input: "data.csv"
    # NOTE(review): assumes clean.py writes "cleaned.csv" into the
    # working directory — confirm against src/clean.py.
    output: "cleaned.csv"
    # Set parameters for columns to drop and one-hot encode
    params:
        drop=expand_config(config['drop_columns']),
        encode=expand_config(config['encode_columns']),
    # Call clean.py. Snakemake joins list params with spaces; the values
    # are pre-quoted by expand_config so they survive shell word-splitting.
    shell:
        "python ../src/clean.py --input {input} --drop {params.drop} --encode {params.encode}"

# STEP 2: Split the cleaned data into training, test and unlabelled
# subsets for model fitting, evaluation and prediction respectively.
rule split:
    input: "cleaned.csv"
    output: "training.csv", "test.csv", "unlabelled.csv"
    params:
        target_column=config['target_column'],  # name of the label column
        target_1=config['target_1'],  # label value for one class — presumably mapped to 1; verify in train_val_pred.py
        target_0=config['target_0'],  # label value for the other class — presumably mapped to 0; verify
        to_predict=config['to_predict'],  # label value marking rows to treat as unlabelled
        perc_split=config['perc_split'],  # split fraction — semantics defined by train_val_pred.py; confirm
    shell:
        "python ../src/train_val_pred.py --cleaned {input} --target_column {params.target_column}"
        " --target_1 {params.target_1} --target_0 {params.target_0} --to_predict {params.to_predict}"
        " --perc_split {params.perc_split}"

# STEP 3: Fit a TPOT pipeline on the training data, persisting both the
# generated pipeline code (tpot_pipe.py) and the fitted model
# (model.joblib).
rule tpot:
    input: "training.csv"
    output: "tpot_pipe.py", "model.joblib"
    params:
        target=config['target_column'],
        max=config['TPOT_max_time'],  # TPOT search time budget — units defined by tpot_caller.py; confirm
        outdir=config['directory'] # Note this argument is relative to /src
    shell:
        "python ../src/tpot_caller.py --training {input} --target {params.target} "
        "--max_time {params.max} --outdir ../{params.outdir}"


rule evaluate:
input:
test="test.csv",
training="training.csv"
output: "metrics.csv", "roc_data.csv", "precrec_data.csv"
params:
target=config['target_column'],
shell:
"python ../src/evaluate.py --test {input.test} --target {params.target} --training {input.training}"

# STEP 5: Predict labels for the unlabelled rows using the fitted model.
# NOTE(review): model.joblib is declared as an input (ensuring `rule
# tpot` runs first) but is not passed on the command line — presumably
# predict.py loads it from the working directory; confirm against
# src/predict.py.
rule predict:
    input:
        unlabelled="unlabelled.csv",
        model="model.joblib",
        training="training.csv"
    output: "unlabelled_predictions.csv"
    params: target=config['target_column']
    shell:
        "python ../src/predict.py --unlabelled {input.unlabelled} --training {input.training} --target {params.target}"

# STEP 6: Copy config.json into the working directory once predictions
# exist, recording the run parameters and satisfying the `rule all`
# target to mark workflow completion.
rule end:
    input: "unlabelled_predictions.csv"
    output: "config.json"
    shell:
        "cp ../config.json ."
17 changes: 17 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"directory": "snakepot_test",
"input": "test/test.csv",
"drop_columns": [
"OO1", "OO2", "OO12", "OO12", "OO13", "OO14", "OO25", "OO28", "OO29", "OO35", "OO36",
"OO38", "OO37", "OO40", "OO39"
],
"encode_columns": [
"0045", "OO43", "OO44", "OO41", "OO42"
],
"target_column": "target",
"target_1": "BENIGN",
"target_0": "DELETERIOUS",
"to_predict": "ND",
"perc_split": 0.2,
"TPOT_max_time": 5
}

0 comments on commit c8cf6e2

Please sign in to comment.