---
title: 'Report title'
output:
  html_notebook:
    theme: flatly
    toc: yes
  pdf_document:
    toc: yes
  word_document:
    toc: yes
  html_document:
    df_print: paged
    toc: yes
author: 'Author name'
date: "`r format(Sys.time(), '%d %B, %Y')`"
---
```{r, include=FALSE}
options(tinytex.verbose = TRUE)
```
```{r, include = FALSE, message = FALSE}
library(webshot)
# Install PhantomJS if it is not already available (needed by webshot)
if (!is_phantomjs_installed()) {
  install_phantomjs()
}
library(tidymodels)
library(tidyverse)
library(inspectdf)
library(skimr)
library(table1)
library(kableExtra)
library(glmnet)
library(kknn)
library(ranger)
library(xgboost)
library(vip)
```
## Problem description
Describe the problem you are trying to solve and why it is worth solving.
## Brief introduction to the data
Write a short paragraph describing the data you are using, including where the data came from.
## Data exploration
Describe and explore the data using R, using tables, graphics etc.
```{r}
# Load the training data (predictors + outcome) and the test predictors
load("train_predictors_outcome.RData")
load("test_predictors.RData")
# Are there missing values in the training data?
show_plot(inspect_na(train_predictors_outcome))
```
```{r}
# Outcome class balance: counts and proportions of h1n1_vaccine
train_predictors_outcome %>%
  count(h1n1_vaccine) %>%
  mutate(prop = n / sum(n))
```
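The outcome is imbalanced, which is what motivates the downsampling step in the preprocessing recipe below; a quick bar chart makes the imbalance visible. A minimal illustrative sketch using ggplot2 (loaded via tidyverse):
```{r}
# Visualize the class imbalance in the outcome
train_predictors_outcome %>%
  ggplot(aes(x = h1n1_vaccine)) +
  geom_bar()
```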
```{r}
set.seed(82734)
# Make a recipe for how the imputation will be done (specify the imputation model)
impute_recipe <- recipe(h1n1_vaccine ~ ., data = train_predictors_outcome) %>% # h1n1_vaccine as outcome, all other variables as predictors
  update_role(respondent_id, new_role = "ID") %>%
  step_knnimpute(all_predictors(), neighbors = 10) # Use k-nearest-neighbors imputation with 10 neighbors
# Prep the imputation recipe on the training data
impute_prep <- prep(impute_recipe, training = train_predictors_outcome)
# Bake (compute/retrieve) the imputed training data using the prep above
train_imp <- bake(impute_prep, train_predictors_outcome)
# Save for later
save(train_imp, file = "train_imp.RData")
```
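The fitted models below are later asked to predict on `test_predictors`, so the test set should pass through the same imputation as the training data. A minimal sketch, assuming `test_predictors` holds the same predictor columns as the training data (recent versions of recipes do not require the outcome column at `bake()` time); the name `test_pred_imp` is illustrative:
```{r}
# Apply the imputation trained on the training data to the test predictors.
# Assumes test_predictors has the same predictor columns; the outcome
# h1n1_vaccine is not needed when baking.
test_pred_imp <- bake(impute_prep, new_data = test_predictors)
# Check that no missing values remain in the test predictors
show_plot(inspect_na(test_pred_imp))
```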
```{r}
# Confirm that no missing values remain after imputation
show_plot(inspect_na(train_imp))
```
```{r}
########################################################################
# RECIPE FOR PREPROCESSING
########################################################################
train_rec <-
  recipe(h1n1_vaccine ~ ., data = train_imp) %>%
  update_role(respondent_id, new_role = "ID") %>%
  step_normalize(all_numeric()) %>% # Center and scale all numeric variables
  step_dummy(all_nominal(), -h1n1_vaccine) %>% # Recode all factors as dummy variables
  themis::step_downsample(h1n1_vaccine) # Downsample the majority class to get a more balanced outcome
```
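To check what the recipe actually produces (centered/scaled numerics, dummy variables, and the downsampled class balance), it can be prepped and baked back on the training data. A small illustrative sketch; note that `themis::step_downsample()` is applied when retrieving the training data but skipped for new data by default:
```{r}
# Inspect the preprocessed (and downsampled) training data
train_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
```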
## Analysis
Describe the analysis. State which prediction methods/models you are implementing. Describe the setup of the analysis, including any sample splitting, the resampling strategy, and the metrics used for evaluating a prediction strategy.
```{r}
########################################################################
# CREATE CROSS-VALIDATION FOLDS FOR MODEL EVALUATION/COMPARISON
########################################################################
# Prepare for 10-fold cross-validation; observations are assigned to folds
# by random sampling stratified on the outcome
set.seed(5000)
folds <- vfold_cv(train_imp, v = 10, strata = h1n1_vaccine)
########################################################################
# EVALUATION METRICS
########################################################################
# Which metrics should be computed?
my_metrics <- metric_set(roc_auc, precision, recall, specificity, accuracy, bal_accuracy, f_meas)
```
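For readers unfamiliar with rsample's fold objects: each split stores an analysis set (used for fitting) and an assessment set (held out for computing metrics). An illustrative sketch, not part of the analysis itself:
```{r}
# Peek at the first resample: ~9/10 of rows for fitting, ~1/10 held out
first_split <- folds$splits[[1]]
dim(analysis(first_split))   # data used to fit the model in this resample
dim(assessment(first_split)) # held-out data used to compute metrics
```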
```{r, warning=FALSE}
########################################################################
# MODELLING
########################################################################

########################################
# MODEL 1: Logistic regression
########################################
# Model specification
lr_mod <-
  logistic_reg() %>%
  set_engine("glm")
# Workflow: which model to use and how the data should be preprocessed
lr_wflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(train_rec)
# Use the workflow and folds object to fit the model on the cross-validation resamples
lr_fit_rs <-
  lr_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
# Get mean out-of-sample performance measures
lr_metrics <- collect_metrics(lr_fit_rs)
lr_metrics
# Store part of the metrics object (.metric, mean, std_err) for later comparison with other models
lr_metrics_sub <- lr_metrics[, c(1, 3, 5)]
lr_metrics_sub <- lr_metrics_sub %>%
  pivot_longer(!.metric, names_to = "measure", values_to = "estimate")
# Fit the above logistic regression model on the full training data
lr_fit_train <-
  lr_wflow %>%
  fit(data = train_imp)
# Look at the model summary
summary(lr_fit_train$fit$fit$fit)
# Get the predicted class probabilities computed for the full training data
lr_pred_prob_train <- predict(lr_fit_train, type = "prob", new_data = train_imp)
# Get the receiver operating characteristic (ROC) curve computed for the full training data
lr_train_roc <- roc_curve(tibble(h1n1_vaccine = train_imp$h1n1_vaccine, lr_pred_prob_train),
                          truth = h1n1_vaccine, estimate = .pred_Yes) %>%
  mutate(model = "Log_reg")
# When you have test data without the outcome
lr_pred_class_test_no_outcome <- predict(lr_fit_train, type = "class", new_data = test_predictors)
lr_pred_prob_test_no_outcome <- predict(lr_fit_train, type = "prob", new_data = test_predictors)
```
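The ROC tibble `lr_train_roc` is computed above but not plotted in this chunk. A suggested sketch for visualizing it with ggplot2 (the `roc_curve()` output has `specificity` and `sensitivity` columns):
```{r}
# Visualize the training-data ROC curve for the logistic regression model
lr_train_roc %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  geom_abline(lty = 2) + # reference line for a no-skill classifier
  coord_equal()
```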
```{r}
############################################################
# CREATE CROSS-VALIDATION FOLDS FOR HYPERPARAMETER TUNING
############################################################
# Prepare for hyperparameter selection by 10-fold cross-validation;
# observations are assigned to folds by random sampling stratified on the outcome
set.seed(1981)
tune_folds <- vfold_cv(train_imp, v = 10, strata = h1n1_vaccine)
# Set the metric for choosing hyperparameters
roc_res <- metric_set(roc_auc)
```
```{r, warning=FALSE}
################################################
# MODEL 2: Penalized logistic regression (LASSO)
################################################
# Model specification
penlr_mod <-
  logistic_reg(mixture = 1, penalty = tune()) %>% # Specify that we want to tune the penalty parameter
  set_engine("glmnet") %>%
  set_mode("classification")
# Set up the workflow
penlr_wflow <-
  workflow() %>%
  add_model(penlr_mod) %>%
  add_recipe(train_rec)
# Get a parameter object for our data and model specification. It contains information about possible values, ranges, types etc.
penlr_param <-
  penlr_wflow %>%
  parameters() %>%
  finalize(train_imp)
# Look at the range for the penalty parameter
penlr_param %>% pull_dials_object("penalty")
# Tune the model: set up a grid of penalty values to be evaluated and select the optimal penalty value (in terms of AUROC)
set.seed(666)
penlr_tune <-
  penlr_wflow %>%
  tune_grid(
    tune_folds,
    grid = penlr_param %>% grid_regular(levels = c(penalty = 100)),
    metrics = roc_res
  )
save(penlr_tune, file = "penlr_tune.RData")
#load("penlr_tune.RData")
# View a plot of penalty values vs. AUROC
autoplot(penlr_tune) + theme(legend.position = "top")
# View the penalty values with the largest AUROC
show_best(penlr_tune) %>% select(-.estimator)
# Store the best penalty value
penlr_best_param <- select_best(penlr_tune, "roc_auc")
# Set up the final workflow using the best penalty value
final_penlr_wflow <-
  penlr_wflow %>%
  finalize_workflow(penlr_best_param)
# View the workflow specifications
final_penlr_wflow
# Fit the final model on the cross-validation folds set up for model evaluation/comparison
penlr_fit_rs <-
  final_penlr_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
# Get mean out-of-sample performance measures
penlr_metrics <- collect_metrics(penlr_fit_rs)
penlr_metrics
# Store part of the metrics object for later comparison with other models
penlr_metrics_sub <- penlr_metrics[, c(1, 3, 5)]
penlr_metrics_sub <- penlr_metrics_sub %>%
  pivot_longer(!.metric, names_to = "measure", values_to = "estimate")
# Fit the final model on the full training data
penlr_fit_train <-
  final_penlr_wflow %>%
  fit(data = train_imp)
# Look at variable importance
penlr_fit_train %>%
  pull_workflow_fit() %>%
  vip(lambda = penlr_best_param$penalty, num_features = 200)
# Get the model coefficients
penlr_coeff <- data.frame(penlr_fit_train %>%
                            pull_workflow_fit() %>%
                            tidy())
# Number of non-zero coefficients
sum(penlr_coeff$estimate != 0)
# Number of zero coefficients
sum(penlr_coeff$estimate == 0)
# Get the predicted class probabilities computed for the full training data
penlr_pred_prob_train <- predict(penlr_fit_train, type = "prob", new_data = train_imp)
# Get the receiver operating characteristic (ROC) curve computed for the full training data
penlr_train_roc <- roc_curve(tibble(h1n1_vaccine = train_imp$h1n1_vaccine, penlr_pred_prob_train),
                             truth = h1n1_vaccine, estimate = .pred_Yes) %>%
  mutate(model = "Pen_log_reg")
# When you have test data without the outcome
penlr_pred_class_test_no_outcome <- predict(penlr_fit_train, type = "class", new_data = test_predictors)
penlr_pred_prob_test_no_outcome <- predict(penlr_fit_train, type = "prob", new_data = test_predictors)
```
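Since each model's ROC tibble carries a `model` label, the curves can be overlaid for a visual comparison. An illustrative sketch for the two models fitted so far:
```{r}
# Overlay the training-data ROC curves of the models fitted so far
bind_rows(lr_train_roc, penlr_train_roc) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
  geom_path() +
  geom_abline(lty = 2) +
  coord_equal()
```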
```{r, warning=FALSE}
################################################
# MODEL 3: Nearest neighbors
################################################
# Model specification
knn_mod <-
  nearest_neighbor(neighbors = tune()) %>% # Specify that we want to tune the neighbors parameter
  set_engine("kknn") %>%
  set_mode("classification")
# Workflow: which model to use and how the data should be preprocessed
knn_wflow <-
  workflow() %>%
  add_model(knn_mod) %>%
  add_recipe(train_rec)
# Get a parameter object for our data and model specification
knn_param <-
  knn_wflow %>%
  parameters() %>%
  finalize(train_imp)
# Look at the range for the neighbors parameter
knn_param %>% pull_dials_object("neighbors")
# Update the range of possible neighbor values
knn_param <-
  knn_wflow %>%
  parameters() %>%
  update(neighbors = neighbors(c(1, 500))) %>%
  finalize(train_imp)
# Look at the updated range for the neighbors parameter
knn_param %>% pull_dials_object("neighbors")
# Tune the model: set up a grid of neighbor values to be evaluated and select the optimal number of neighbors (in terms of AUROC)
set.seed(9923323)
knn_tune <-
  knn_wflow %>%
  tune_grid(
    tune_folds,
    grid = knn_param %>% grid_regular(levels = c(neighbors = 20)),
    metrics = roc_res
  )
save(knn_tune, file = "knn_tune.RData")
#load("knn_tune.RData")
# View a plot of the number of neighbors vs. AUROC
autoplot(knn_tune) + theme(legend.position = "top")
# View the number of neighbors with the largest AUROC
show_best(knn_tune) %>% select(-.estimator)
# Store the best neighbors value
knn_best_param <- select_best(knn_tune, "roc_auc")
# Set up the final workflow using the best neighbors value
final_knn_wflow <-
  knn_wflow %>%
  finalize_workflow(knn_best_param)
# Fit the final model on the cross-validation folds set up for model evaluation/comparison
knn_fit_rs <-
  final_knn_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
# Get mean out-of-sample performance measures
knn_metrics <- collect_metrics(knn_fit_rs)
knn_metrics
# Store some of the results for later comparison with other models
knn_metrics_sub <- knn_metrics[, c(1, 3, 5)]
knn_metrics_sub <- knn_metrics_sub %>%
  pivot_longer(!.metric, names_to = "measure", values_to = "estimate")
# Fit the final model on the full training data
knn_fit_train <-
  final_knn_wflow %>%
  fit(data = train_imp)
# Get the predicted class probabilities computed for the full training data
knn_pred_prob_train <- predict(knn_fit_train, type = "prob", new_data = train_imp)
# Get the receiver operating characteristic (ROC) curve computed for the full training data
knn_train_roc <- roc_curve(tibble(h1n1_vaccine = train_imp$h1n1_vaccine, knn_pred_prob_train),
                           truth = h1n1_vaccine, estimate = .pred_Yes) %>%
  mutate(model = "KNN")
# Get the predicted class (outcome) and class probabilities for the test data
# When you have test data with the outcome
#knn_pred_class_test <- predict(knn_fit_train, type = "class", new_data = test_imp)
#knn_pred_prob_test <- predict(knn_fit_train, type = "prob", new_data = test_imp)
# When you have test data without the outcome
knn_pred_class_test_no_outcome <- predict(knn_fit_train, type = "class", new_data = test_predictors)
knn_pred_prob_test_no_outcome <- predict(knn_fit_train, type = "prob", new_data = test_predictors)
```
```{r}
################################################
# MODEL 4: Random forest
################################################
# Preprocessing without creating dummy variables (tree-based models handle factors directly)
train_rec_rf <-
  recipe(h1n1_vaccine ~ ., data = train_imp) %>%
  update_role(respondent_id, new_role = "ID") %>%
  step_normalize(all_numeric()) %>% # Center and scale all numeric variables
  themis::step_downsample(h1n1_vaccine) # Downsample the majority class
# Model specification
rf_mod <-
  rand_forest(mtry = tune(), trees = tune()) %>% # Specify that we want to tune both the mtry and trees parameters
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")
# Workflow: which model to use and how the data should be preprocessed
rf_wflow <-
  workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(train_rec_rf)
# Get a parameter object for our data and model specification, updating the ranges of mtry and trees
rf_param <-
  rf_wflow %>%
  parameters() %>%
  update(mtry = mtry(c(1, 16))) %>%
  update(trees = trees(c(500, 1500))) %>%
  finalize(train_imp)
# Tune the model
set.seed(911)
rf_tune <-
  rf_wflow %>%
  tune_grid(
    tune_folds,
    grid = rf_param %>% grid_regular(levels = c(mtry = 4, trees = 3)),
    metrics = roc_res
  )
save(rf_tune, file = "rf_tune.RData")
```
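The chunk stops after saving the tuning results. Presumably the finalize-and-evaluate pattern used for the earlier models follows; a sketch of those steps under that assumption (`rf_best_param`, `final_rf_wflow` and `rf_fit_rs` are illustrative names):
```{r}
# Select the best mtry/trees combination (in terms of AUROC) and finalize
# the workflow, mirroring the pattern used for the earlier models
rf_best_param <- select_best(rf_tune, "roc_auc")
final_rf_wflow <-
  rf_wflow %>%
  finalize_workflow(rf_best_param)
# Evaluate the finalized model on the model-comparison folds
rf_fit_rs <-
  final_rf_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
collect_metrics(rf_fit_rs)
```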