model_ama_meta_blender.py
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 06 16:24:41 2017
@author: mimar
"""
import os
import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.stats import pearsonr
from sklearn.model_selection import StratifiedKFold # replaces the removed sklearn.cross_validation
from sklearn.utils import shuffle # only used by the commented-out shuffle step below
from sklearn.ensemble import ExtraTreesClassifier # only used by the commented-out alternative model below
import linearblender as lnbl
def bagged_set(X_ts, y_cs, seed, estimators, xt, yt=None):
    # average the predictions of `estimators` linearblender models;
    # `seed` and `yt` are accepted for interface compatibility but unused
    # create array object to hold predictions
    baggedpred=np.zeros(xt.shape[0])
    # loop for as many times as we want bags
    for n in range(0, estimators):
        model=lnbl.linearblender(task="classification",
                                 metric="auc",
                                 n_classes=2,
                                 n_iter=5,
                                 precision="medium",
                                 verbose=1)
        model.fit(X_ts, y_cs)
        preds=model.predict_proba(xt)[:,1]
        # update bag's running sum
        baggedpred+=preds
        print("completed: " + str(n))
    # divide by the number of bags to create an average estimate
    baggedpred/=estimators
    return baggedpred
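# example (shapes only): with X_tr of shape (n_rows, n_models) and binary y_tr,
# bagged_set(X_tr, y_tr, 1, 3, X_val) returns a (n_val_rows,) array averaging
# 3 refits (which may be identical unless linearblender is internally stochastic)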
#train = np.loadtxt('first_level_train.csv')
#test = np.loadtxt('first_level_test.csv')
#train=np.loadtxt('train.csv',usecols=[k for k in range(1,9)], skiprows=1, delimiter=",")
#test = np.loadtxt('test.csv',usecols=[k for k in range(1,9)], skiprows=1, delimiter=",")
meta_folder="meta_folder/"
if not os.path.exists(meta_folder): # if it does not exist, we create it
    os.makedirs(meta_folder)
models=["lightgbm_script_v1",
        "lightgbm_et_v1",
        "lightgbm_logit_v1",
        "lightgbm_rf_v1",
        "lightgbm_script_v2",
        "lightgbmreg_script_v1",
        "lightgbmreg_script_v2",
        "xg_script_v1",
        "xg_script_v2",
        "dai_preds",
        "lightgbm_script_v1_counts",
        "logit_v1_sparse"
        ]
#"lightgbm_script_v1_counts"]
bench_train=np.loadtxt(meta_folder+models[0]+ ".train.csv")
bench_test=np.loadtxt(meta_folder+models[0]+ ".test.csv")
y =np.loadtxt("train.csv", delimiter=',',usecols=[0], skiprows=1)
Xmetatrain=None
Xmetatest=None
for j in range(len(models)):
    mini_xtrain=np.loadtxt(meta_folder+models[j]+ ".train.csv") # load the held-out (out-of-fold) train predictions of model j
    mini_xtest=np.loadtxt(meta_folder+models[j]+ ".test.csv") # load the test set predictions of model j
    mean_train=np.mean(mini_xtrain) # calculate the mean of the train set held-out predictions for reconciliation purposes
    mean_test=np.mean(mini_xtest) # calculate the mean of the test set predictions
    cor_train=pearsonr(bench_train,mini_xtrain)[0]
    cor_test=pearsonr(bench_test,mini_xtest)[0]
    # print the AUC, means and correlations as a sanity check; e.g. if the mean of
    # the train predictions is 1232314.34 and the test mean is 0.7, something is wrong...
    print("model %d auc %f mean train/test %f/%f corr train/test %f/%f" % (j,roc_auc_score(np.array(y),mini_xtrain),mean_train,mean_test,cor_train,cor_test))
    if Xmetatrain is None:
        Xmetatrain=mini_xtrain
        Xmetatest=mini_xtest
    else:
        Xmetatrain=np.column_stack((Xmetatrain,mini_xtrain))
        Xmetatest=np.column_stack((Xmetatest,mini_xtest))
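# Xmetatrain now holds one column of out-of-fold predictions per first-level
# model (shape: n_train_rows x len(models)); Xmetatest is aligned the same way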
X=Xmetatrain
X_test=Xmetatest
output_name="model_ama_meta_et_v1"
print(X.shape, y.shape)
############### Params section #####################
bagging=1 # number of models trained with different seeds
number_of_folds = 5 # number of folds in stratified cv
kfolder=StratifiedKFold(n_splits=number_of_folds, shuffle=True, random_state=1)
#model to use
#modelextra= ExtraTreesClassifier(n_estimators=1000, criterion='entropy', max_depth=40,
#min_samples_split=4,min_samples_leaf=2, max_features=0.7,n_jobs=25, random_state=1)
train_stacker=[ 0.0 for k in range (0,X.shape[0]) ] # holds the out-of-fold predictions
mean_auc = 0.0 # running sum of the per-fold AUCs
#X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
i=0 # fold counter
print ("starting cross validation with %d kfolds " % (number_of_folds))
for train_index, test_index in kfolder.split(X, y):
    # training and validation sets
    X_train, X_cv = X[train_index], X[test_index]
    y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
    print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))
    preds= bagged_set(X_train,y_train, 1, bagging, X_cv,yt=None)
    auc=roc_auc_score(y_cv, preds)
    print ("size train: %d size cv: %d auc (fold %d/%d): %f" % ((X_train.shape[0]), (X_cv.shape[0]), i + 1, number_of_folds,auc))
    mean_auc += auc
    # save this fold's predictions at their original row positions
    no=0
    for real_index in test_index:
        train_stacker[real_index]=(preds[no])
        no+=1
    i+=1 # increment cv iterator
if (number_of_folds)>0:
    mean_auc/=number_of_folds
    print (" Average auc : %f" % (mean_auc) )
np.savetxt(meta_folder+output_name+ ".train.csv",np.array(train_stacker), fmt='%.6f')
print (" writing train meta-features ")
# refit on the full train set and predict the test set
preds= bagged_set(X,y, 1, bagging, X_test,yt=None)
# === Predictions === #
print (" writing test meta-features ")
np.savetxt(meta_folder+output_name+ ".test.csv",np.array(preds), fmt='%.6f')
print("Write results...")
output_file = "submission_"+str( (mean_kapa ))+".csv"
print("Writing submission to %s" % output_file)
f = open(output_file, "w")
f.write("id,action\n")# the header
for g in range(0, len(preds)) :
f.write("%d,%f\n" % (g+1,preds[g]))
f.close()
print("Done.")