-
Notifications
You must be signed in to change notification settings - Fork 2
/
cpt_anno_organ.py
189 lines (150 loc) · 7.83 KB
/
cpt_anno_organ.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import numpy as np
import pandas as pd
import os
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# DESCRIPTION: THIS SCRIPT GENERATES AUC SCORES FOR THE AGGREGATE AND SUB MODELS.
# THE SUBMODELS ARE DEFINED BY THEIR ORGAN GROUP, NOT INDIVIDUAL CPT CODE
# SAVES TO OUTPUT:
# --- logit_agg_organ.csv
# --- logit_sub_organ.csv
###############################
# ---- STEP 1: LOAD DATA ---- #
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
dir_figures = os.path.join(dir_base, '..', 'figures')
fn_X = 'X_imputed.csv'
fn_Y = 'y_agg.csv'
dat_X = pd.read_csv(os.path.join(dir_output, fn_X))
dat_Y = pd.read_csv(os.path.join(dir_output, fn_Y))
# CREATE DUMMY VARIABLES FOR NON NUMERIC
dat_X = pd.get_dummies(dat_X)
# !! ENCODE CPT AS CATEGORICAL !! #
dat_X['cpt'] = 'c' + dat_X.cpt.astype(str)
# GET CPT ANNOTATIONS
file_name = 'cpt_anno_organ.csv'
cpt_anno = pd.read_csv(os.path.join(dir_output, file_name))
cpt_anno['cpt'] = 'c' + cpt_anno.cpt.astype(str)
# GROUP BY TITLE AND GET COUNTS - REMOVE GROUPS WITH ONLY 1 COUNT
cpt_groups = cpt_anno.groupby('organ').size().sort_values(ascending=False)
cpt_groups = pd.DataFrame({'cpt_title': cpt_groups.index, 'count': cpt_groups.values})
# KEEP CPT TITLES THAT HAVE MORE THAN ONE CPT CODE ASSOCIATED WITH THEM
top_groups = cpt_groups[cpt_groups['count'] > 1]
top_groups = top_groups.cpt_title.unique()
# SUBSET CPT GROUPS TO KEEP ONLY CPTS THAT HAVE A CORRESPONDING TITLE
cpt_anno = cpt_anno[cpt_anno.organ.isin(top_groups)].reset_index(drop=True)
top_cpts = cpt_anno.cpt.unique()
# SUBET BY DATA FRAMES BY CPT CODES
dat_X = dat_X[dat_X.cpt.isin(top_cpts)].reset_index(drop=True)
dat_Y = dat_Y[dat_Y.caseid.isin(dat_X.caseid)].reset_index(drop=True)
# GET COLUMNS
cn_X = list(dat_X.columns[2:])
cn_Y = list(dat_Y.columns[25:37])
# DELETE NON AGG LABELS
dat_Y.drop(dat_Y.columns[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]],
axis=1, inplace=True)
###############################################
# ---- STEP 2: LEAVE-ONE-YEAR - ALL VARIABLES ---- #
# START LOOP
holder_y_all = []
for ii, vv in enumerate(cn_Y):
print('##### ------- Outcome %s (%i of %i) -------- #####' % (vv, ii + 1, len(cn_Y)))
tmp_ii = pd.concat([dat_Y.operyr, dat_Y[vv] == -1], axis=1)
tmp_ii = tmp_ii.groupby('operyr')[vv].apply(np.sum).reset_index().rename(columns={vv: 'n'})
tmp_years = tmp_ii[tmp_ii.n == 0].operyr.values
tmp_years = tmp_years.astype(int)
tmp_train_years = tmp_years[tmp_years > (tmp_years.min())]
holder_y = []
for yy in tmp_train_years:
print('Train Year %i' % (yy))
idx_train = dat_X.operyr.isin(tmp_years) & (dat_X.operyr < yy)
idx_test = dat_X.operyr.isin(tmp_years) & (dat_X.operyr == yy)
# get dummies
Xtrain, Xtest = dat_X.loc[idx_train, cn_X].reset_index(drop=True), \
dat_X.loc[idx_test, cn_X].reset_index(drop=True)
ytrain, ytest = dat_Y.loc[idx_train, [vv]].reset_index(drop=True), \
dat_Y.loc[idx_test, [vv]].reset_index(drop=True)
# STORE CPTS
tmp_cpt = Xtest.cpt
# REMOVE CPTS
del Xtrain['cpt']
del Xtest['cpt']
# TRAIN MODEL
logisticreg = LogisticRegression(solver='liblinear', max_iter=200)
logit_fit = logisticreg.fit(Xtrain, ytrain.values.ravel())
# PREDICT
logit_preds = logit_fit.predict_proba(Xtest)[:, 1]
# STORE RESULTS FROM AGGREGATE MODEL
tmp_holder = pd.DataFrame({'y_preds': list(logit_preds), 'y_values': list(ytest.values), 'cpt': list(tmp_cpt)})
# GET ORGAN GROUPS
top_titles = cpt_anno.organ.unique()
within_holder = []
# LOOP THROUGH EACH ORGAN GROUP
for cc in top_titles:
# SUBSET DATA BY ORGAN GROUP
title_cpts = cpt_anno[cpt_anno['organ']==cc].reset_index(drop=True)
title_cpts = title_cpts.cpt.unique()
sub_tmp_holder = tmp_holder[tmp_holder['cpt'].isin(title_cpts)].reset_index(drop=True)
if all(sub_tmp_holder.y_values.values == 0) or len(sub_tmp_holder.y_values.values) <= 1:
within_holder.append(pd.DataFrame({'auc': 'NA',
'cpt': cc}, index=[0]))
else:
within_holder.append(pd.DataFrame({'auc': metrics.roc_auc_score(list(sub_tmp_holder.y_values.values),
list(sub_tmp_holder.y_preds.values)),
'cpt': cc}, index=[0]))
holder_y.append(pd.concat(within_holder).assign(test_year=yy))
holder_y_all.append(pd.concat(holder_y).assign(outcome=vv))
res_y_all = pd.concat(holder_y_all).reset_index(drop=True)
res_y_all.to_csv(os.path.join(dir_output, 'agg_cpt_organ.csv'), index=False)
####################################################
# ---- STEP 3: LEAVE-ONE-YEAR - ALL VARIABLES, FOR EACH CPT CODE, SUB MODELS---- #
holder_y_all = []
for ii, vv in enumerate(cn_Y):
print('##### ------- Outcome %s (%i of %i) -------- #####' % (vv, ii + 1, len(cn_Y)))
tmp_ii = pd.concat([dat_Y.operyr, dat_Y[vv] == -1], axis=1)
tmp_ii = tmp_ii.groupby('operyr')[vv].apply(np.sum).reset_index().rename(columns={vv: 'n'})
tmp_years = tmp_ii[tmp_ii.n == 0].operyr.values
tmp_years = tmp_years.astype(int)
tmp_train_years = tmp_years[tmp_years > (tmp_years.min())]
holder_y = []
for yy in tmp_train_years:
print('Train Year %i' % (yy))
idx_train = dat_X.operyr.isin(tmp_years) & (dat_X.operyr < yy)
idx_test = dat_X.operyr.isin(tmp_years) & (dat_X.operyr == yy)
Xtrain, Xtest = dat_X.loc[idx_train, cn_X].reset_index(drop=True), \
dat_X.loc[idx_test, cn_X].reset_index(drop=True)
ytrain, ytest = dat_Y.loc[idx_train, [vv]].reset_index(drop=True), \
dat_Y.loc[idx_test, [vv]].reset_index(drop=True)
# GET ORGAN GROUPS
top_titles = cpt_anno.organ.unique()
within_holder = []
# LOOP THROUGH ORGAN GROUPS
for cc in top_titles:
# SUSET BY ORGAN
title_cpts = cpt_anno[cpt_anno['organ'] == cc].reset_index(drop=True)
title_cpts = list(title_cpts.cpt.unique())
# SUBSET XTRAIN AND XTEST BY CPT CODE
sub_xtrain = Xtrain[Xtrain['cpt'].isin(title_cpts)]
sub_xtest = Xtest[Xtest['cpt'].isin(title_cpts)]
# SUBSET YTRAIN AND YTEST BY THE CORRESPONDING INDICES IN SUBSETTED XDATA
sub_ytrain = ytrain[ytrain.index.isin(sub_xtrain.index)]
sub_ytest = ytest[ytest.index.isin(sub_xtest.index)]
# REMOVE CPT COLUMN
del sub_xtrain['cpt']
del sub_xtest['cpt']
# FILL RESULTS WITH NA IF TRAIN OR TEST OUTCOMES ARE ALL ONE VALUE
if all(np.unique(sub_ytrain.values) == 0) or all(np.unique(sub_ytest.values) == 0) or all(sub_ytest.values == 1) or len(sub_ytest.values) <= 1:
within_holder.append(pd.DataFrame({'auc': 'NA',
'cpt': cc}, index=[0]))
else:
# TRAIN MODEL
logisticreg = LogisticRegression(solver='liblinear', max_iter=200)
logit_fit = logisticreg.fit(sub_xtrain, sub_ytrain.values.ravel())
# TEST MODEL
logit_preds = logit_fit.predict_proba(sub_xtest)[:, 1]
within_holder.append(
pd.DataFrame({'auc': metrics.roc_auc_score(sub_ytest.values, logit_preds), 'cpt': cc}, index=[0]))
holder_y.append(pd.concat(within_holder).assign(test_year=yy))
holder_y_all.append(pd.concat(holder_y).assign(outcome=vv))
res_y_all = pd.concat(holder_y_all).reset_index(drop=True)
res_y_all.to_csv(os.path.join(dir_output, 'sub_cpt_organ.csv'), index=False)