-
Notifications
You must be signed in to change notification settings - Fork 0
/
5_feature_selection_BPE.R
76 lines (63 loc) · 2.46 KB
/
5_feature_selection_BPE.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
library(tidyverse)
# library(DataExplorer)
source('prep_imaging.R') # sources the clinical data as well
# Attach the neoadjuvant-treatment flag as the response column `rcd_neoadj`,
# then drop the patient identifier (presumably so it is not treated as a
# feature by the `rcd_neoadj ~ .` model further down).
# NOTE(review): assumes df.merged rows are in the same order as fgt.cleaned —
# confirm against prep_imaging.R.
fgt.cleaned[["rcd_neoadj"]] <- df.merged$Received_Neoadjuvant
fgt.cleaned[["Patient_ID"]] <- NULL
### TEST MULTICOLLINEARITY #########################################################################
## Variance Inflation Factors ------------------------------------------------------------------------
# library('olsrr')
#
# # Create linear regression model
# model.all <- lm(
# as.numeric(rcd_neoadj) ~ .,
# data = fgt.cleaned
# )
# summary(model.all)
#
# testVIF <- ols_vif_tol(model.all)
# testVIF <- testVIF[order(testVIF$VIF), ]
# keep.variables <- subset(testVIF, VIF < 7, select = Variables) # TODO: cite a source justifying the VIF < 7 cutoff
#
# # Remove features with severe multicollinearity
# fgt.selected <- subset(fgt.cleaned, select = names(fgt.cleaned) %in% keep.variables$Variables)
# fgt.selected$rcd_neoadj <- fgt.cleaned$rcd_neoadj
### SELECTION USING ALGORITHMS #####################################################################
# Logit --------------------------------------------------------------------------------------------
library(glmnet)

# `fgt.selected` is only created by the VIF section above, which is currently
# commented out. If nothing else has defined it, fall back to the full cleaned
# feature table so the rest of the script can run.
# NOTE(review): confirm whether prep_imaging.R is expected to provide
# `fgt.selected`; if so this fallback is never taken.
if (!exists("fgt.selected")) {
  fgt.selected <- fgt.cleaned
}

# Logistic regression of neoadjuvant status on every remaining column.
logit.model <- glm(rcd_neoadj ~ ., family = "binomial", data = fgt.selected)
options(scipen = 999) # print coefficients without scientific notation
summary(logit.model)

# Per-predictor importance from caret. The call is namespace-qualified because
# caret is never attached in this file, so a bare `varImp()` would not be found.
importance <- caret::varImp(logit.model, scale = FALSE)
# summarize importance
print(importance)

# Tidy copy: predictor name as a column, sorted from most to least important.
var.importance <- importance
var.importance$var <- rownames(var.importance)
var.importance <- var.importance[order(var.importance$Overall, decreasing = TRUE), ]
row.names(var.importance) <- NULL

# plot importance
# `plot()` on a data frame with one numeric and one character column fails
# (the character column cannot be used as an axis), so draw a labelled dot
# chart instead. `rev()` puts the most important predictor at the top.
dotchart(
  rev(var.importance$Overall),
  labels = rev(var.importance$var),
  xlab = "Overall importance"
)
### PCA ############################################################################################
# library(devtools)
library(ggbiplot)

# Principal components of every column except the last one, which is assumed
# to be the response (`rcd_neoadj`) — TODO confirm the column order.
img.pca <- prcomp(x = fgt.selected[, -ncol(fgt.selected)])
summary(img.pca)

# Biplot of the observations only (variable arrows hidden), with one ellipse
# per treatment group.
# NOTE(review): the grouping vector is taken from `clinical`, whereas the
# response column was built from `df.merged`; verify both have the same rows
# in the same order.
ggbiplot(
  img.pca,
  groups = clinical$Received_Neoadjuvant,
  ellipse = TRUE,
  var.axes = FALSE,
  obs.scale = 1,
  var.scale = 1
)
# ggtitle('PCA Plot from 37 Features')
### CLUSTERING #####################################################################################
# Row annotation for the heatmap: blue where Received_Neoadjuvant is 0,
# red otherwise.
neoadj <- clinical[, c("Received_Neoadjuvant", "Known_Ovarian_Status")]
neoadj$colors_neo <- ifelse(neoadj$Received_Neoadjuvant == 0, "blue", "red")

# Clustered heatmap of the feature columns (last column, assumed to be the
# response, is dropped), with one side-colour per sample.
heatmap(
  data.matrix(fgt.selected[, -length(fgt.selected)]),
  margins = c(3, 2),
  # Spell out the full column name. `neoadj$colors` only worked through `$`
  # partial matching on a base data.frame; on a tibble it yields NULL and the
  # colour bar silently disappears.
  RowSideColors = neoadj$colors_neo,
  xlab = "BPE features",
  ylab = "Sample",
  labRow = c(""),
  labCol = c("")
)