#!/usr/bin/env python
"""
An additional script to run a series of experiments described in table like
etc/experiments.csv where columns are hyperparameters and rows are experiments.
"""
# TODO: fix docstrings
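# A sketch of the expected table layout (illustrative only; the real column
# names are defined by `params` and `result_cols` in the __main__ block below):
# parameter columns come first, followed by one or more blocks of result
# columns, e.g.
#
#   Load model,Output,...,Data config,balanced_accuracy_test,auc_test,auc_val
#   ,tmp/experiment1/,...,etc/data.ini,,,
#
# Empty result fields mark experiments that still have to be run.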
import os
import matplotlib
matplotlib.use('Agg') # plot without a running X server
import math
import random
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from joblib.parallel import BACKENDS
from moloi.moloi import experiment
from moloi.dictionaries import results_dict
from moloi.results import process_results

# Prefer joblib's 'loky' backend; fall back to plain multiprocessing if this
# joblib version does not provide it.
BACKEND = 'loky'
if BACKEND not in BACKENDS.keys():
    BACKEND = 'multiprocessing'
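
# For reference, the registered backends can be inspected at runtime:
#
#     >>> from joblib.parallel import BACKENDS
#     >>> sorted(BACKENDS)  # typically includes 'loky', 'multiprocessing',
#     ...                   # 'sequential' and 'threading'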


def isnan(x):
    """Return True if x is a float NaN."""
    return isinstance(x, float) and math.isnan(x)
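# e.g. isnan(float('nan')) -> True, isnan('nan') -> False; unlike bare
# math.isnan, this never raises TypeError on string cells read from the CSV.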


def worker(i, table, exp_settings, common_gridsearch, result_cols, keys, params,
           experiments_file):
    """
    Process row i of the experiments table. For every empty result block in
    the row (the fields after the parameter columns, in groups of
    len(result_cols)), build the argument list from the row values, call the
    experiment function (run_model.py) and write the returned metrics back to
    experiments_file.

    Params
    ------
    i: int
        index of the row (experiment) to process
    table: pandas.DataFrame
        the experiments table read from experiments_file
    exp_settings: dict
        settings shared by all experiments (random_state, verbose, plots, ...)
    common_gridsearch: bool
        one gridsearch for all experiments in a row
    result_cols: list
        the result metrics which will be added to experiments_file,
        e.g. [accuracy_test, accuracy_train, rec, auc, auc_val, f1, gparams]
    keys: list
        non-positional arguments of the experiment function (run_model.py),
        in the same order as params
    params: list
        the names of the columns of non-positional arguments in experiments_file
    experiments_file: string
        path to the experiments table
    """
    rparams = False
    command = []
    # Build the argument list from the row: skip NaN cells, append boolean
    # flags without a value and cast numeric options to their expected types.
    for c, p in enumerate(params):
        if not isnan(table[p][i]):
            if keys[c] in ["-g", "--dummy"]:
                command.append(keys[c])
            elif keys[c] in ["--n_bits", "--n_cv", "--n_jobs", "-p", "--n_iter"]:
                command.append(keys[c])
                command.append(int(table[p][i]))
            elif keys[c] in ["--split"]:
                command.append(keys[c])
                command.append(float(table[p][i]))
            else:
                command.append(keys[c])
                command.append(str(table[p][i]))
    command.append("-e")
    command.append(experiments_file)
    # One pass per result block in the row.
    for j in range((table.shape[1] - len(params)) // len(result_cols)):
        results = results_dict()
        final_command = command + ["-t", int(j)]
        print(final_command)
        # only run the experiment if this result block is still empty
        if isnan(table.iloc[i, j * len(result_cols) + len(params)]):
            if not common_gridsearch:
                rparams = False
            exp_settings['rparams'] = rparams
            results = experiment(final_command, exp_settings, results)
            model_address = results["model_address"]
            results["gparams"] = results["rparams"]
            # round all numeric results to 4 decimal places
            for key in results:
                if isinstance(results[key], (str, dict)):
                    continue
                elif isinstance(results[key], (list, np.ndarray)):
                    for k in range(len(results[key])):
                        results[key][k] = round(results[key][k], 4)
                else:
                    results[key] = round(results[key], 4)
            # re-read the table so results written by other workers in the
            # meantime are not overwritten
            table = pd.read_csv(experiments_file)
            for p, r in enumerate(result_cols):
                table.iloc[i, j * len(result_cols) + len(params) + p] = str(results[r])
            if model_address:
                table.iloc[i, 4] = str(model_address)  # the "Load model" column
                if "--load_model" not in command and common_gridsearch:
                    command.append("--load_model")
                    command.append(model_address)
            table.to_csv(experiments_file, index=False)


def main(experiments_file, common_gridsearch, random_state, result_cols, keys,
         params, verbose, n_jobs, refit, plots, callbacks):
    """ Process inputs and start workers. """
    if random_state is None:
        random_state = random.randint(1, 100)
    np.random.seed(random_state)
    # tf.set_random_seed(random_state)
    table = pd.read_csv(experiments_file)
    # do not spawn more workers than there are experiments
    if n_jobs > table.shape[0]:
        n_jobs = table.shape[0]
    exp_settings = {
        'experiments_file': os.path.splitext(os.path.basename(experiments_file))[0],
        'random_state': random_state,
        'rparams': False,
        'verbose': verbose,
        'refit': refit,
        'plots': plots,
        'callbacks': callbacks
    }
    Parallel(n_jobs=n_jobs, backend=BACKEND, verbose=verbose)(
        delayed(worker)(i, table, exp_settings, common_gridsearch, result_cols,
                        keys, params, experiments_file)
        for i in range(table.shape[0]))


if __name__ == "__main__":
    keys = ["--load_model", "--output", "--model_config", "--descriptors", "--n_bits",
            "--n_cv", "--n_jobs", "-p", "-g", "--n_iter", "--metric", "--split_type",
            "--split_s", '--select_model', '--data_config']
    params = ["Load model", "Output", "Model config", "Descriptors", "n_bits", "n_cv",
              "n_jobs", "Patience", "Gridsearch", "n_iter", "Metric", "Split type",
              "Split size", 'Model', 'Data config']
    result_cols = ['balanced_accuracy_test', 'auc_test', 'auc_val']
    # result_cols = ['r2_test', 'r2_val', 'mae_test', 'mae_val']
    # plots = ["history", "AUC", "gridsearch", "feature_importance",
    #          "feature_importance_full", "results", "TSNE", "PCA",
    #          "correlation", "distributions"]
    plots = ["history", "AUC", "gridsearch", "results", "TSNE"]
    callbacks = "stopping, csv_logger, checkpoint"
    common_gridsearch = True
    random_state = 1337
    experiments_file = 'etc/test.csv'
    verbose = 0
    refit = False
    n_jobs = 1  # multiprocessing.cpu_count() # only for evaluation
    # reference lists of the available descriptor sets and split types (unused below)
    descriptors = [['rdkit', 'morgan', 'mordred', 'maccs'], ['rdkit'], ['morgan'],
                   ['mordred'], ['maccs']]
    splits = ['cluster', 'scaffold', 'random', 'stratified']

    main(experiments_file, common_gridsearch, random_state, result_cols, keys, params,
         verbose, n_jobs, refit, plots, callbacks)
filenames = ["maccs", "rdkit", "mordred", "morgan", "rdkit_maccs", "rdkit_mordred", "morgan_maccs", "morgan_mordred", "rdkit_morgan", "mordred_maccs", "rdkit_morgan_mordred_maccs"]
titles = ['MACCS', 'RDKit', 'Mordred', 'Morgan', 'MACCS+RDKit', 'RDKit+Mordred', 'Morgan+MACCS', 'Morgan+Mordred', 'RDKit+Morgan', 'Mordred+MACCS', 'RDKit+Morgan+Mordred+MACCS']
# NAME = ["clintox_scaffold", "clintox_random", "clintox_cluster", "bace_scaffold", "bace_random", "bace_cluster"]
process_results(filenames, titles, [os.path.basename(experiments_file).split('.')[0]])
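
# Typical workflow (an assumption inferred from the code above, not a
# documented interface): fill etc/test.csv with one experiment per row and
# leave the result columns empty, then run `python run.py`. Rows whose result
# fields are already filled are skipped on re-runs, because the isnan() check
# in worker() only launches experiments for empty result blocks.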