-
Notifications
You must be signed in to change notification settings - Fork 0
/
heartseek_analyze_hyperopt_binary_shap.py
123 lines (101 loc) · 3.99 KB
/
heartseek_analyze_hyperopt_binary_shap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
"""
Collect iterations for a repeated nested stratified k-fold cross
validation for binary one-vs-all classification using the selected RF classifier
with hyperparameters previously optimized for the performance metric of choice
and extract SHAP values from each iteration for the positive class. Average all
SHAP values for each subject and plot beeswarm plots based on them.
Usage:
heartseek_analyze_hyperopt_binary_shap.py <curr_cat> <classifier> <scorfunc> <rep_size> <outer_size> <inner_size>
Arguments:
<curr_cat> Current positive category
<classifier> Classifier
<scorfunc> Scoring function for hyperopt if doing it
<rep_size> Number of repetitions
<outer_size> Number of outer folds
<inner_size> Number of inner folds
"""
import os, shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from docopt import docopt
#Set current positive category, classifier, metric to optimize hyperparameters,
#CV parameters, and current CV repetition and outer fold.
args = docopt(__doc__)
curr_cat = args['<curr_cat>']
classifier = args['<classifier>']
scorfunc = args['<scorfunc>']
rep_size = args['<rep_size>']
outer_size = args['<outer_size>']
inner_size = args['<inner_size>']
print(curr_cat,classifier,scorfunc,rep_size,outer_size,inner_size)
#Set base path and output path.
basepath = ('binary_'+classifier+'_'+scorfunc+'_r'+rep_size+'_fo'+outer_size+'_fi'+inner_size+'/')
outpath = (basepath+'gather/')
os.makedirs(outpath,exist_ok=True)
#Set numeric arguments including current positive category, CV repetitions,
#CV outer fold number, and CV inner fold number.
ccat = int(curr_cat)
nrep = int(rep_size)
inner_k = int(inner_size)
outer_k = int(outer_size)
#Read in the input matrix.
infile = 'heartseek_XY.csv'
inmat = pd.read_csv(infile)
#Produce labels and extract dimensions.
ylabs = ['tr_labels','tr_idx']
xlabs = [x for x in inmat.columns if x not in ylabs]
nfeat = len(xlabs)
nsample = inmat.shape[0]
ycat = ['good-prognosis','remitting-course','clinical-worsening','persistent-course']
ncat = len(ycat)
#Divide input matrix into X features and Y label based on the current positive
#category.
data_X = inmat.loc[:,xlabs]
data_Y = (inmat.loc[:,'tr_idx']==ccat).astype(int)
data_Y.name = ycat[ccat]
#Define repetition and outer fold labels.
rk_labs = []
for ridx in range(nrep):
for outidx in range(outer_k):
rk_labs.append('r'+str(ridx+1)+'_f'+str(outidx+1))
nrk = len(rk_labs)
#Collect the sum of feature importances per sample for the positive class and
#the count to average later.
nsub = data_X.shape[0]
sublist = data_X.index.values
subsum = pd.DataFrame(np.zeros((nsub,nfeat)),index=sublist,columns=xlabs)
subcount = pd.Series(np.zeros((nsub)),index=sublist)
#Fill in collectors.
for ridx in range(nrep):
for outidx in range(outer_k):
rk_lab = 'r'+str(ridx+1)+'_f'+str(outidx+1)
#Open store.
infile = (basepath+curr_cat+'_binary_'+rk_lab+'.h5')
instore = pd.HDFStore(infile,'r')
#Extract feature importance for the positive class.
inlab = 'shvals_p'
inkey = ('/'+inlab+'_'+rk_lab)
shvals_p = instore.select(inkey)
instore.close()
#Add features and count by subject.
csublist = shvals_p.index
subsum.loc[csublist,:] += shvals_p
subcount.loc[csublist] += 1
#Divide the sum by the count to get the average.
submean = subsum.div(subcount,axis=0)
#Save the average SHAP values.
outfile = (outpath+curr_cat+'_feature_shap_1.csv')
submean.to_csv(outfile,index=False)
#Read in the impurity-based feature importance for the sorting.
infile = (outpath+curr_cat+'_feature.csv')
inmat = pd.read_csv(infile,index_col=0)
xlabs_order = inmat.index.values[::-1]
#Generate the beeswarm plot.
shap.summary_plot(submean.loc[:,xlabs_order].values,data_X.loc[:,xlabs_order],
show=False,sort=False,max_display=nfeat)
outfile = (outpath+curr_cat+'_beeswarm_shap_1.jpg')
plt.savefig(outfile,bbox_inches='tight',dpi=720)
plt.close()
print('Saved.')