-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy path3_booster_and_submissions.R
117 lines (84 loc) · 3.24 KB
/
3_booster_and_submissions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
library(xgboost)
#-----------------------------------------------------------------
# Data assembly: read the cleaned train/test tables, bolt on the
# per-row aggregate features, encode missingness, and convert to
# numeric matrices ready for xgboost.
#-----------------------------------------------------------------

# Helper: load one CSV from the cleaned-data folder without
# coercing strings to factors.
read_clean <- function(name) {
  read.csv(file.path(".", "clean_dataset", name), stringsAsFactors = FALSE)
}

# Original features plus the aggregate features, column-bound
# (the aggregate files are row-aligned with the base files).
train <- cbind(read_clean("train.csv"), read_clean("train_aggregates.csv"))
test <- cbind(read_clean("test.csv"), read_clean("test_aggregates.csv"))

# Target vector lives in its own file.
labels <- read_clean("target.csv")

# Encode missing values as a large negative sentinel. This is
# only valid for tree-based learners, which can split it away.
train[is.na(train)] <- -300
test[is.na(test)] <- -300

# Keep the id columns for the submission file, then drop them
# from the feature tables before training.
test_id <- test$id
train_id <- train$id
train <- train[, -1]
test <- test[, -1]

# The target column in the labels file is named `x`.
target <- labels$x

# xgboost wants numeric matrices, not data frames.
train <- data.matrix(train)
test <- data.matrix(test)
#-----------------------------------------------------------------
# Ensemble of gradient boosters: fit `control` xgboost models on
# the full training set (each tree sees a 50% row subsample, so
# runs differ) and average their test-set predictions.
#-----------------------------------------------------------------
control <- 50
# Accumulator sized from the test matrix itself rather than a
# hard-coded row count (was 39420), so the script keeps working
# if the dataset changes size.
zeros <- rep(0, nrow(test))
for (i in seq_len(control)) {
  bst <- xgboost(data = train,
                 label = target,
                 eta = 0.1,
                 max_depth = 6,
                 subsample = 0.5,
                 colsample_bytree = 1,
                 nrounds = 400,
                 # NOTE(review): "reg:linear" is a deprecated alias of
                 # "reg:squarederror" in xgboost >= 0.90; kept as-is for
                 # compatibility with older package versions.
                 objective = "reg:linear",
                 eval_metric = "rmse",
                 maximize = FALSE)
  zeros <- zeros + predict(bst, test)
}
# Average the accumulated predictions across the ensemble.
zeros <- zeros / control
#-----------------------------------------------------------------
# Submission: pair each test id with its (bias-corrected) averaged
# prediction and write the result to disk.
#-----------------------------------------------------------------
# The 0.987 factor is a constant multiplicative bias correction
# applied to every prediction.
submission <- data.frame(ID = test_id,
                         Footfall = zeros * 0.987,
                         stringsAsFactors = FALSE)
write.csv(submission, file = "final_prediction.csv", row.names = FALSE)