-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbuild_importance_ci_revisit.R
53 lines (45 loc) · 1.43 KB
/
build_importance_ci_revisit.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Fit XGBoost 100 times on the training split and save the per-feature
# information gain of every run (one "Gain<i>" column per run) to
# ./Results/bst_importance_100_revisit.RData.
# The mean across runs is not computed in this script; only the per-run
# table is saved.
# Original author note: "9 days, full dataset".
# NOTE(review): every relative path below is resolved against this directory,
# so the script only works on a machine where ~/ED_Return/ exists.
setwd('~/ED_Return/')
library(readr)
library(plyr)
library(dplyr)
library(reshape2)
library(parallel)
library(caret)
library(xgboost)
library(doMC)
library(pROC)
# Register 5 cores with doMC for foreach-based parallelism.
# NOTE(review): nothing in the visible script calls foreach; the XGBoost fits
# take their thread count from `nthread` instead -- confirm this is needed.
registerDoMC(5)

# load() restores objects under the names they were saved with. The code
# below relies on `indeces` (with an `i_test` element) and `dataset` (with
# `x` and `y`) appearing in the workspace; presumably one comes from each
# file -- TODO confirm. Fail early with a clear message if either is missing.
load('./Results/indeces_revisit.RData')
load('./Results/sparseMatrix_revisit.RData')
stopifnot(exists("indeces"), exists("dataset"))

x <- dataset$x
y <- dataset$y
rm(dataset)

# Labels must line up row-for-row with the feature matrix.
stopifnot(length(y) == nrow(x))

# Prepare matrices for XGBoost: rows listed in i_test form the test split,
# all remaining rows form the training split.
# The training rows are computed with setdiff() rather than negative indexing
# because x[-integer(0), ] selects *no* rows, so an empty i_test would
# silently produce an empty training set.
test_rows <- indeces$i_test
train_rows <- setdiff(seq_len(nrow(x)), test_rows)

# drop = FALSE keeps the matrix shape even when a split has a single row.
x_train <- x[train_rows, , drop = FALSE]
y_train <- y[train_rows]
x_test <- x[test_rows, , drop = FALSE]
y_test <- y[test_rows]

# Free the full data set and the temporary index vectors.
rm(x, y, test_rows, train_rows)
# Number of independent XGBoost fits whose per-feature gain is recorded.
n_runs <- 100

# Feature names do not change between runs, so look them up once. colnames()
# uses the public accessor instead of reaching into the @Dimnames slot.
feature_names <- colnames(x_train)

# One Feature/Gain table per run, collected first and joined after the loop
# instead of growing `result` with a join on every iteration.
# NOTE(review): no RNG seed is set anywhere in this script, so the column
# sampling (colsample_bylevel) makes every execution produce different gains.
gain_by_run <- vector("list", n_runs)
for (i in seq_len(n_runs)) {
  bst <- xgboost(data = x_train, label = y_train,
                 max_depth = 20, eta = 0.3,
                 nthread = 5, nrounds = 20,
                 eval_metric = "auc",
                 objective = "binary:logistic",
                 colsample_bylevel = 0.03)

  # Importance table of the fitted model; keep only the feature name and its
  # gain, selected by column name so the code does not depend on column order.
  importance <- xgb.importance(feature_names = feature_names, model = bst)
  importance <- importance[, c("Feature", "Gain")]

  # Give the gain column a run-specific name: Gain1, Gain2, ...
  names(importance)[2] <- paste0("Gain", i)
  gain_by_run[[i]] <- importance

  print(paste("Finished iteration", i))
}

# Combine all runs into one table with a row per feature and a column per run.
# A full join keeps every feature that received gain in at least one run; a
# left join onto run 1 would silently drop any feature that the first fit
# happened not to split on. Runs in which a feature was not used show NA in
# that run's column, so downstream averaging must decide how to treat NA
# (e.g. as zero gain, or skipped with na.rm = TRUE).
result <- Reduce(
  function(joined, run) full_join(joined, run, by = "Feature"),
  gain_by_run
)

save(result, file = './Results/bst_importance_100_revisit.RData')