forked from ed-wilkes/predictive-modelling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClinChem_2020-317479_Supplemental_Analysis_Guide.R
396 lines (346 loc) · 23.3 KB
/
ClinChem_2020-317479_Supplemental_Analysis_Guide.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
###############################################################################################
## Supplementary material to:
## A machine learning approach for the auomated interpretation of plasma amino acid profiles
## Wilkes EH, Emmett E, Beltran L, Woodward GM, Carling RS
## Clinical Chemistry (2020)
##
## Description:
## This script aims to guide readers through the analyses performed within the above paper.
## For clarity, the script does not include the WSRF and XGB analyses; however, comments
## are provided for the function arguments that need to be changed in order to reproduce these.
## Note that the WSRF and XGB algorithms take longer to train than RF.
###############################################################################################
## Load required packages ----
library(Boruta)
library(doParallel)
library(doRNG)
library(dplyr)
library(foreach)
library(gplots)
library(ggplot2)
library(MLmetrics)
library(reshape2)
library(tsne)
## Load raw data ----
df_model_binary <- read.csv(file.choose(), header = TRUE) # binary classification data
df_model_all <- read.csv(file.choose(), header = TRUE) # multi-class data
## t-SNE analysis ----
# We must first create a copy of the data and encode the categorical variables (SEX, ASA, Allo, and Hcys) as integers
df_model_encode <- df_model_all %>%
select(-SID) %>%
mutate(
SEX = ifelse(SEX == "F", 0, ifelse(SEX == "M", 1, ifelse(SEX == "U", 2, NA)))
,ASA = ifelse(ASA == "N", 0, ifelse(ASA == "Y", 1, NA))
,Allo = ifelse(Allo == "N", 0, ifelse(Allo == "Y", 1, NA))
,Hcys = ifelse(Hcys == "N", 0, ifelse(Hcys == "Y", 1, NA))
)
# Loops across several values of perplexity and performs t-SNE analysis
set.seed(123) # It is important to set the random seed prior to analysis for reproducibility
iterations <- c(5, 10, 25, 50) # the values of perplexity we will use
num_cores <- 4 # reduce this value if your PC has < 4 virtual cores
cluster <- makeCluster(num_cores) # create a multi-core cluster to speed up the analysis
registerDoParallel(cluster)
df_tsne <- foreach(i = iter(iterations)
,.combine = rbind
,.options.RNG = 123
,.packages = "dplyr") %dorng% {
tsne_results <- tsne::tsne(df_model_encode %>%
select(-ID
,-Class
,-Class.Category)
,perplexity = i
,max_iter = 5000
,k = 3) # maximum of three dimensions
df_results <- data.frame(tsne_results
,Class = df_model_encode$Class
,Class.Category = df_model_encode$Class.Category
,Perp = i)
return(df_results)
}
stopCluster(cluster)
# Create a plot of the results
colours <- rainbow(20)
colours[20] <- "grey80" # make the "normal" class grey for clarity
plot_1 <- ggplot(df_tsne, aes(x = X1, y = X2, colour = Class))+
geom_point(alpha = 1)+
facet_wrap(~Perp)+
xlab("Dimension 1")+
ylab("Dimension 2")+
scale_colour_manual(values = colours)+
theme_bw()+
theme(legend.position = "none") # remove the legend for clarity
# Visualise t-SNE (perplexity = 25) in three dimensions
colours_3d <- data.frame(Class = levels(df_tsne$Class), color = colours)
rgl::plot3d(x = filter(df_tsne, Perp == "Perplexity = 25")$X1
,y = filter(df_tsne, Perp == "Perplexity = 25")$X2
,z = filter(df_tsne, Perp == "Perplexity = 25")$X3
,col = colours_3d$color[match(df_tsne$Class, colours_3d$Class)]
,size = 8)
rgl::play3d(rgl::spin3d())
## Boruta analysis of the data set ----
# To gain an insight into the features that are most important for the class separation
# we must first define the borutaCVParallel function by running the "borutaCVParallel.R"
# script from:
# https://github.com/ed-wilkes/predictive-modelling/blob/master/borutaCVParallel.R
num_cores <- 10 # change this parameter depending on how many virtual cores your PC has
# NB: It is best to leave some cores free (e.g., 12 - 2 = 10)
set.seed(123) # again, this is critical to reproducibility. The value doesn't matter, it just
# needs to be the same each time you perform an analysis
boruta_output <- borutaCVParallel(data = df_model_all[,-c(1, 30)] # remove ID and Class columns
,y = "Class"
,outer_k = 3
,outer_rep = 5
,seed = 123
,max_runs = 200
,n_cores = num_cores)
boruta_output$features # this provides the features selected in each fold/repeat
boruta_output$importance # this provides the feature importance values calculated for each fold/repeat
## Perform nested CV on the binary data with random forests +/- down-sampling +/- feature selection ----
# To perform the nested CV, we must first we must define the performNestedCVParallel
# function by running the "performNestedCVParallel.R" script from:
# https://github.com/ed-wilkes/predictive-modelling/blob/master/performNestedCVParallel.R
# The "method" argument can be changed to "wsrf" or "xgbTree" in order to fit
# WSRF and XGB models to the data
# Defining the nested CV parameters
k_outer <- 3 # number of folds in the outer loop
rep_outer <- 5 # number of repeats in the outer loop
k_inner <- 2 # number of folds in the inner loop
rep_inner <- 10 # number of repeats in the outer loop
# Running the first nested CV analysis
rf_model_binary <- performNestedCVParallel(data = df_model_binary[,-1] # remove the ID column
,y = "Class"
,method = "rf" # use the randomForest algorithm
,metric = "AUC" # assess performance with PRAUC
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "none"
,seed = 123
,verbose = FALSE
,n_cores = num_cores
,target_class = "X.Abnormal")
# View the results
rf_model_binary$resample # this provides the raw inner and outer loop results
rf_model_binary$prediction_class # this provides the hard classifications for each outer loop
rf_model_binary$prediction_prob # this provides the class probabilities for each outer loop
rf_model_binary$hyperparameters # this provides the selected hyperparameters from the inner loop
# Perform further nested CV analyses
rf_model_binary_down <- performNestedCVParallel(data = df_model_binary[,-1]
,y = "Class"
,method = "rf"
,metric = "AUC"
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "down" # use down-sampling
,seed = 123
,verbose = FALSE
,n_cores = num_cores
,target_class = "X.Abnormal")
rf_model_binary_select <- performNestedCVParallel(data = df_model_binary[,-1]
,y = "Class"
,method = "rf"
,metric = "AUC"
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "none"
,seed = 123
,verbose = FALSE
,feat_select = TRUE # include feature selection
,max_runs = 200 # perform a max of 200 Boruta runs
,n_cores = num_cores
,target_class = "X.Abnormal")
rf_model_binary_select$features # This provides the selected features for each outer loop
# This will be identical to boruta_output$features (assuming
# the same seed as been used each time)
rf_model_binary_down_select <- performNestedCVParallel(data = df_model_binary[,-1]
,y = "Class"
,method = "rf"
,metric = "AUC"
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "down" # use down-sampling
,seed = 123
,verbose = FALSE
,feat_select = TRUE # include feature selection
,max_runs = 200 # perform a max of 200 Boruta runs
,n_cores = num_cores
,target_class = "X.Abnormal")
## Perform nested CV on the multi-class data with random forests +/- down-sampling +/- feature selection ----
# To use the F4-score as an optimisation metric, we must first define the new
# caret F4 function by running the "multiClassSummaryF4.R" script from:
# https://github.com/ed-wilkes/predictive-modelling/blob/master/multiClassSummaryF4.R
# We also need to load in the custom caret sampling method ("customSample") from:
# https://github.com/ed-wilkes/predictive-modelling/blob/master/customSample.R
# Running the nested CV analyses
rf_model_multi <- performNestedCVParallel(data = df_model_all[,-1]
,y = "Class"
,method = "rf"
,metric = "Mean_F4" # assess performance with F4 score
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "none"
,seed = 123
,verbose = FALSE
,n_cores = num_cores)
rf_model_multi_down <- performNestedCVParallel(data = df_model_all[,-1]
,y = "Class"
,method = "rf"
,metric = "Mean_F4"
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "down" # down-sampling
,seed = 123
,verbose = FALSE
,n_cores = num_cores)
rf_model_multi_custom <- performNestedCVParallel(data = df_model_all[,-1]
,y = "Class"
,method = "rf"
,metric = "Mean_F4"
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "custom" # custom down-sampling
,seed = 123
,verbose = FALSE
,n_cores = num_cores)
rf_model_multi_select <- performNestedCVParallel(data = df_model_all[,-1]
,y = "Class"
,method = "rf"
,metric = "Mean_F4"
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "none"
,seed = 123
,verbose = FALSE
,feat_select = TRUE # include feature selection
,max_runs = 200 # perform a max of 200 Boruta runs
,n_cores = num_cores)
rf_model_multi_down_select <- performNestedCVParallel(data = df_model_all[,-1]
,y = "Class"
,method = "rf"
,metric = "Mean_F4"
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "down" # down-sampling
,seed = 123
,verbose = FALSE
,feat_select = TRUE # include feature selection
,max_runs = 200 # perform a max of 200 Boruta runs
,n_cores = num_cores)
rf_model_multi_custom_select <- performNestedCVParallel(data = df_model_all[,-1]
,y = "Class"
,method = "rf"
,metric = "Mean_F4"
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "custom" # custom down-sampling
,seed = 123
,verbose = FALSE
,feat_select = TRUE # include feature selection
,max_runs = 200 # perform a max of 200 Boruta runs
,n_cores = num_cores)
## Plot the modelling results ----
# You can access the results of the nested CV analysis through the saved output object;
# however, we can also use the convenient "plotConfMatrix.R" and "plotAUC.R" functions
# available from:
# https://github.com/ed-wilkes/predictive-modelling/blob/master/plotConfMatrix.R
# https://github.com/ed-wilkes/predictive-modelling/blob/master/plotAUC.R
# NB: plotConfMatrix computes the sum of the predictions across the folds within each
# repeat of the nested CV process and then averages these across the repeats
# To determine the best probability threshold for hard classification from the binary
# classifiers, we need to use the "findProbCutoff.R" and "predictBinaryClasses.R"
# functions available from:
# https://github.com/ed-wilkes/predictive-modelling/blob/master/findProbCutoff.R
# https://github.com/ed-wilkes/predictive-modelling/blob/master/predictBinaryClasses.R
# Plot binary analysis data
plot_2a <- plotAUC(data = rf_model_binary$prediction_prob
,y_measure = "ppv" # positive predictive value (precision)
,x_measure = "sens" # sensitivity (recall)
,order = c("No.significant.abnormality.detected.", "X.Abnormal"))
# Find the best class probability thresholds for each fold/repeat of the nested CV process
rf_model_binary_threshold <- findProbCutoff(rf_model_binary$prediction_prob
,order = c("No.significant.abnormality.detected."
,"X.Abnormal")
,measure_1 = "ppv"
,measure_2 = "sens"
,optimise = "F4") # optimise the threshold based on F4-score
# Make hard classification calls for each fold/repeat based on the tuned threshold
rf_model_binary_tuned <- predictBinaryClasses(predictions = rf_model_binary$prediction_prob
,thresholds = rf_model_binary_threshold
,target = "X.Abnormal")
plot_2b <- plotConfMatrix(data = rf_model_binary_tuned)
abline(h = 644 / (644 + 1440)) # Positive class / (Positive class + Negative class)
# Plot multi-class analysis data
plot_3 <- plotConfMatrix(data = rf_model_multi$prediction_class)
## Average prediction probabilities from multiple individual classifiers ----
# Train models with the WSRF algorithm
wsrf_model_binary <- performNestedCVParallel(data = df_model_binary[,-1]
,y = "Class"
,method = "wsrf" # use the WSRF algorithm
,metric = "AUC" # assess performance with PRAUC
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "none"
,seed = 123
,verbose = FALSE
,n_cores = num_cores
,target_class = "X.Abnormal")
# Train models with the XGB algorithm
xgb_model_binary <- performNestedCVParallel(data = df_model_binary[,-1]
,y = "Class"
,method = "xgbTree" # use the XGB algorithm
,metric = "AUC" # assess performance with PRAUC
,outer_k = k_outer
,outer_rep = rep_outer
,inner_k = k_inner
,inner_rep = rep_inner
,sampling = "none"
,seed = 123
,verbose = FALSE
,n_cores = num_cores
,target_class = "X.Abnormal")
# Collect prediction probabilities and melt them to long format
df_pred_model_1 <- data.frame(melt(rf_model_binary$prediction_prob
,id.vars = c("fold", "row_num", "obs"))
,model = "rf")
df_pred_model_2 <- data.frame(melt(rf_model_binary$prediction_prob
,id.vars = c("fold", "row_num", "obs"))
,model = "wsrf")
df_pred_model_3 <- data.frame(melt(rf_model_binary$prediction_prob
,id.vars = c("fold", "row_num", "obs"))
,model = "xgb")
# Combine the probabilities and calculate the average for each case in each fold/repeat
df_pred_all <- bind_rows(df_pred_model_1, df_pred_model_2, df_pred_model_3)
df_pred_averaged <- df_pred_all %>%
group_by(fold, obs, row_num, variable) %>%
summarise(mean_pred = mean(value)) %>% # calculate mean across rf, wsrf, and xgb probabilities
ungroup()
df_pred_all$fold <- factor(df_pred_all$fold, levels = unique(df_pred_all$fold))
# Split the data into a list based on fold/repeat number
list_pred <- split(df_pred_averaged, df_pred_averaged$fold)
# Calculate the PRAUC for each fold/repeat with the new, averaged probabilities
ensemble_prauc <- lapply(list_pred, function(x) {
auc <- MLmetrics::PRAUC(y_pred = x$mean_pred, y_true = x$obs)
}) %>% unlist
# Calculate the mean PRAUC
mean(ensemble_prauc)