# stackingClicks.R
# This script will help you get started with building stacked models in R.
# Libraries such as caret and H2O stack models automatically, so many of us never see what happens behind the scenes.
# This script walks through stacking step by step and gives you full control of the process.
# It builds five XGBoost models at level 1 (L1) and then fits a logistic regression model on top of their predictions.
path <- "add path"
setwd(path)
library(data.table)
library(lubridate)
library(caret)
library(xgboost)
train <- fread("train.csv",na.strings = c(" ","",NA,"NA"))
test <- fread("test.csv",na.strings = c(" ","",NA,"NA"))
# Data Pre-Processing -----------------------------------------------------
# Impute Missing values
colSums(is.na(train))
train[is.na(siteid), siteid := -999]
test[is.na(siteid), siteid := -999]
train[is.na(browserid), browserid := "None"]
test[is.na(browserid), browserid := "None"]
train[is.na(devid), devid := "None"]
test[is.na(devid), devid := "None"]
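# Quick post-imputation check (a diagnostic, safe to skip): siteid, browserid
# and devid should now report zero NAs; any NAs left belong to columns this
# script does not impute.
colSums(is.na(train))
colSums(is.na(test))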
# Create Date Features
# Parse the full timestamp. as.Date would drop the time component, which
# would make the thour and tminute features below all zero.
train[,datetime := as.POSIXct(datetime, format = "%Y-%m-%d %H:%M:%S")]
test[,datetime := as.POSIXct(datetime, format = "%Y-%m-%d %H:%M:%S")]
train[,tweekday := weekdays(datetime)]
test[,tweekday := weekdays(datetime)]
train[,thour := hour(datetime)]
test[,thour := hour(datetime)]
train[,tminute := minute(datetime)]
test[,tminute := minute(datetime)]
# Label Encoding
feats <- setdiff(colnames(train), c("ID"))
for ( f in feats)
{
if (class(train[[f]]) == 'character')
{
levels <- unique(c(train[[f]], test[[f]]))
train[[f]] <- as.numeric(factor(train[[f]], levels = levels))
test[[f]] <- as.numeric(factor(test[[f]], levels = levels))
}
}
# Some Aggregate Features
train[,agg1 := .N, .(siteid, offerid)]
test[,agg1 := .N, .(siteid, offerid)]
train[,agg2 := .N, .(siteid, category)]
test[,agg2 := .N, .(siteid, category)]
train[,agg3 := .N, .(siteid, merchant)]
test[,agg3 := .N, .(siteid, merchant)]
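# One more hedged example in the same spirit, commented out: a raw siteid
# frequency as a cheap popularity proxy. This is an assumption, not part of
# the original pipeline - keep it only if validation improves.
# train[, site_count := .N, by = siteid]
# test[, site_count := .N, by = siteid]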
# To avoid memory trouble, train on a 1-million-row sample of the data - you can always increase it
trainX <- train[sample(.N, 1e6)]
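# A hedged alternative sketch, commented out: if clicks are rare, a stratified
# sample via caret::createDataPartition (caret is loaded above) preserves the
# click rate of the full training data in the sample.
# idx <- createDataPartition(factor(train$click), p = 1e6 / nrow(train), list = FALSE)
# trainX <- train[as.vector(idx)]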
# Stacking ----------------------------------------------------------------
# The following script works this way:
# 1. For every seed (each paired with a different nrounds value), it creates out-of-fold predictions for train and full-sample predictions for test.
# 2. The train and test predictions are stored in oof_train and oof_test respectively.
# 3. Finally, a logistic regression model is trained on the L1 predictions.
# Comments are added throughout to help you follow what's going on.
folds <- createFolds(trainX$click, k = 10, list = T)
# seeds (and a different nrounds value per seed) for the five L1 models
coseed <- c(1,13,999,10001,9009)
num_rounds <- c(50,100,150,200,250)
params <- list(
objective = 'binary:logistic',
base_score = mean(trainX$click), # initial prediction: the positive-class rate, not the share of zeros
max_depth = 10,
eta = 0.03,
subsample = 0.5,
colsample_bytree = 0.5,
min_child_weight = 1,
eval_metric = "auc"
)
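# Optional sanity check (a sketch, commented out because it is slow): before
# committing to the five-seed loop, xgb.cv gives a quick out-of-fold AUC
# estimate for these params. fn mirrors the feature list used below.
# fn <- setdiff(colnames(trainX), c('ID','datetime','click'))
# cv <- xgb.cv(params = params, nrounds = 100, nfold = 5,
#              data = xgb.DMatrix(as.matrix(trainX[, fn, with = FALSE]),
#                                 label = trainX$click),
#              print_every_n = 20)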
# create data frame to store predictions
oof_train <- data.frame(ID = numeric(), actual = numeric(), predict = numeric(), seed = numeric())
oof_test <- data.frame(ID = test$ID)
# Start Stacking
for (i in seq(coseed))
{
  cat('Building on seed: ',coseed[i],'\n')
  set.seed(coseed[i]) # actually seed the RNG, so subsampling differs across the five runs
  feature.names <- setdiff(colnames(trainX), c('ID','datetime','click'))
for(j in seq(folds))
{
cat('\n')
cat(sprintf('Training on Fold: %d ',j),'\n')
idex <- folds[[j]]
X_train <- trainX[!idex]
X_valid <- trainX[idex]
cat(sprintf("Training model on: %d rows ", nrow(X_train)),'\n')
cat(sprintf("Testing model on: %d rows", nrow(X_valid)),'\n \n')
dtrain <- xgb.DMatrix(data = as.matrix(X_train[,feature.names,with=F]), label = X_train$click)
dvalid <- xgb.DMatrix(data = as.matrix(X_valid[,feature.names,with=F]), label = X_valid$click)
watchlist <- list(train = dtrain, valid = dvalid)
bst <- xgb.train(params = params
                 ,data = dtrain
                 ,nrounds = as.integer(num_rounds[i])
                 ,print_every_n = 20
                 ,watchlist = watchlist
                 ,early_stopping_rounds = 40
                 ,maximize = T) # nthread defaults to all available cores
preds <- predict(bst, dvalid)
df = data.frame(ID = X_valid$ID, actual = X_valid$click, predict = preds, seed = i)
oof_train <- rbind(oof_train, df)
#cat(sprintf("Now, oof_train has %d rows and %d columns",nrow(oof_train), ncol(oof_train)),'\n')
rm(bst, dtrain, dvalid, preds,df)
gc()
}
cat('\n')
cat(sprintf("After seed %d, oof_train has %d rows and %d columns", i, nrow(oof_train), ncol(oof_train)),'\n \n')
cat("Now training on the full sample........",'\n')
dtest <- xgb.DMatrix(data = as.matrix(test[,feature.names,with=F]))
dtrain <- xgb.DMatrix(data = as.matrix(trainX[,feature.names,with=F]), label = trainX$click)
print ('Training model...')
bst <- xgb.train(params = params
                 ,data = dtrain
                 ,nrounds = as.integer(num_rounds[i])
                 ,print_every_n = 20
                 ,maximize = T) # nthread defaults to all available cores
print('predicting....')
preds <- predict(bst, dtest)
oof_test[[paste0("pred_seed_",i)]] <- preds
cat(sprintf("oof_test has %s rows and %s columns",nrow(oof_test),ncol(oof_test)),'\n \n')
rm(bst, dtrain, dtest, preds) # df was already removed inside the fold loop
}
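# Diagnostic sketch: out-of-fold AUC per seed shows whether the five L1
# models are genuinely diverse or five copies of the same signal.
# rank_auc is a small helper (Mann-Whitney form of AUC), not part of xgboost.
rank_auc <- function(actual, pred) {
  r <- rank(pred); n1 <- sum(actual == 1); n0 <- length(actual) - n1
  (sum(r[actual == 1]) - n1 * (n1 + 1) / 2) / (n1 * n0)
}
sapply(split(oof_train, oof_train$seed), function(d) rank_auc(d$actual, d$predict))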
# Now creating Level 1 Data -----------------------------------------------
oof_train$seed <- as.factor(oof_train$seed)
mastertrain <- split(oof_train, oof_train$seed)
for(x in seq(mastertrain))
{
assign(paste0("oof_train_",x), data.table(mastertrain[[x]]))
}
setnames(oof_train_1,"predict","pred_seed_1")
setnames(oof_train_2,"predict","pred_seed_2")
setnames(oof_train_3,"predict","pred_seed_3")
setnames(oof_train_4,"predict","pred_seed_4")
setnames(oof_train_5,"predict","pred_seed_5")
mastertrain <- merge(x = oof_train_1, y = oof_train_2[,.(ID, pred_seed_2)], by = 'ID', all.x = T)
mastertrain <- merge(x = mastertrain, y = oof_train_3[,.(ID, pred_seed_3)], by = 'ID', all.x = T)
mastertrain <- merge(x = mastertrain, y = oof_train_4[,.(ID, pred_seed_4)], by = 'ID', all.x = T)
mastertrain <- merge(x = mastertrain, y = oof_train_5[,.(ID, pred_seed_5)], by = 'ID', all.x = T)
head(mastertrain)
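# The L2 model only helps where the L1 predictions disagree; a correlation
# matrix of the five prediction columns makes their diversity visible.
cor(mastertrain[, .(pred_seed_1, pred_seed_2, pred_seed_3, pred_seed_4, pred_seed_5)])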
rm(oof_train, oof_train_1, oof_train_2, oof_train_3, oof_train_4, oof_train_5,trainX,X_train,X_valid)
gc()
setnames(mastertrain,"actual","click")
# Training Model ----------------------------------------------------------
# Logistic Regression
mastertrain[,click := as.factor(click)]
lr_model <- glm(click ~ ., data = mastertrain[,-c('ID','seed'),with=F], family = binomial(link = 'logit'),maxit = 500)
summary(lr_model)
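# In-sample AUC of the stacker, using the rank_auc helper defined above.
# It is optimistic by construction, since it scores the rows the model was fit on.
rank_auc(as.integer(as.character(mastertrain$click)), predict(lr_model, type = "response"))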
preds <- predict(lr_model, oof_test, type = 'response')
submit <- data.table(ID = oof_test$ID, click = preds)
fwrite(submit, "stacked_lgr.csv")
# What can you do next?
# 1. Train on more data.
# 2. Create more features.
# 3. Make the L1 predictions more diverse.
# 4. Try boosting on the L1 predictions instead of logistic regression (see the sketch below).
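# A minimal sketch of idea 4, commented out, assuming mastertrain and oof_test
# are still in memory: a shallow xgboost on the five L1 prediction columns
# replaces the logistic stacker. Shallow trees and few rounds guard against
# the L2 model overfitting the meta-features.
# meta <- paste0("pred_seed_", 1:5)
# d2 <- xgb.DMatrix(as.matrix(mastertrain[, meta, with = FALSE]),
#                   label = as.integer(as.character(mastertrain$click)))
# l2 <- xgb.train(params = list(objective = "binary:logistic", max_depth = 2,
#                               eta = 0.1, eval_metric = "auc"),
#                 data = d2, nrounds = 100)
# preds_l2 <- predict(l2, as.matrix(oof_test[, meta]))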