-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstacking.py
199 lines (168 loc) · 6.55 KB
/
stacking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# -*- coding: utf-8 -*-
import numpy as np
# NOTE: the star import supplies svmMethod, lrMethod, nbMethod, rfMethod and
# dataPreprocessing, which the stacking class below relies on.
from standard_template import *
from external_template import gbdtMethod
from sklearn import linear_model

# ``sklearn.externals.joblib`` was removed in scikit-learn 0.23.  Prefer the
# standalone joblib package and fall back to the legacy location so that old
# environments keep working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# ``sklearn.cross_validation`` was removed in scikit-learn 0.20 in favour of
# ``sklearn.model_selection``; try the current location first.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Per-learner switch table (presumably 1 = enabled, 0 = disabled).
# NOTE(review): nothing in this file reads this table -- the stacking class
# always runs svm, lr, nb and rf regardless of the values here.  Confirm
# whether it is consumed elsewhere before relying on it.
selectMethod = {'svm': 1, 'lr': 1, 'nb': 1, 'rf': 0, 'gbdt': 0}
class stacking(svmMethod, lrMethod, nbMethod, rfMethod):
    """Two-stage stacking ensemble on top of the svm/lr/nb/rf base learners.

    Stage 1 trains or queries every base learner.  ``stackingPlan`` reduces
    the per-learner outputs to a single feature set.  Stage 2 fits a
    logistic-regression model on that feature set and persists it with
    joblib under ``stage2ModelPath``.
    """

    # Base learners, in the order their outputs are stacked.  ``stackingPlan``
    # selects by position in this order, so reordering changes its result.
    _baseLearners = (svmMethod, lrMethod, nbMethod, rfMethod)

    # File the fitted stage-2 model is dumped to and loaded from.
    stage2ModelPath = 'stage2model'

    def __init__(self):
        # The base-class initialisers are not chained here: every stage-1
        # method calls the relevant base ``__init__`` itself right before
        # using that learner.
        self.test_size = 0.3     # hold-out fraction for stackTrainTestSplit
        self.random_state = 48   # seed handed to train_test_split

    def establishStage2Model(self):
        """Return a new, unfitted stage-2 model.

        The model is an L2-penalised logistic regression (C=50, newton-cg
        solver, at most 1000 iterations).
        """
        return linear_model.LogisticRegression(
            penalty='l2',
            dual=False,
            C=50,
            solver='newton-cg',
            max_iter=1000,
        )

    def stackTrainTestSplit(self, dataPath, labelPath):
        """Load data and labels, then split them into train and test parts.

        Loading goes through ``dataPreprocessing.dataStackPort``; the split
        uses ``self.test_size`` and ``self.random_state``.

        Returns:
            The tuple ``(dataTrain, dataTest, labelTrain, labelTest)``.
        """
        data, label = dataPreprocessing.dataStackPort(self, dataPath, labelPath)
        return train_test_split(
            data,
            label,
            test_size=self.test_size,
            random_state=self.random_state,
        )

    def stackStage2TestProcess(self, data):
        """Run the persisted stage-2 model on stage-1 output ``data``.

        Returns:
            ``(TopPreds, probModel)`` where ``probModel`` is the model's
            ``predict_proba`` output and ``TopPreds`` is what the inherited
            ``TopPredict`` derives from it.
        """
        model = joblib.load(self.stage2ModelPath)
        probModel = model.predict_proba(data)
        TopPreds = super().TopPredict(probModel)
        return TopPreds, probModel

    def stackStage2TrainProcess(self, data, label):
        """Fit a fresh stage-2 model on ``data``/``label`` and dump it to disk."""
        print('start stage2 training')
        model = self.establishStage2Model()
        model.fit(data, label)
        joblib.dump(model, self.stage2ModelPath)

    def stackTestProcess(self, data):
        """Query every base learner on ``data`` and combine the results.

        Each base class is re-initialised on ``self`` immediately before its
        ``stackingTestPort`` is called.

        Returns:
            Whatever ``stackingPlan`` yields for the collected outputs.
        """
        probTestSet = []
        for learner in self._baseLearners:
            learner.__init__(self)
            probTestSet.append(learner.stackingTestPort(self, data))
        return self.stackingPlan(probTestSet)

    def stackTrainProcess(self, data, label):
        """Train every base learner on ``data``/``label``.

        Each base class is re-initialised on ``self`` immediately before its
        ``stackingTrainPort`` is called.
        """
        for learner in self._baseLearners:
            learner.__init__(self)
            learner.stackingTrainPort(self, data, label)

    def stackingPlan(self, probStackSet):
        """Reduce the per-learner outputs to the stage-2 input.

        ``probStackSet`` holds one entry per base learner, in
        ``_baseLearners`` order (svm, lr, nb, rf).

        The active plan simply returns entry 1, i.e. the output of the
        second learner (lr); the other entries are ignored.  An averaging
        plan (element-wise mean over all learners) was sketched by the
        original author but is disabled.
        """
        return probStackSet[1]

    def stackStage1FeatureAndLabelProcess(self, data, label):
        """Collect stage-1 features and labels from every base learner.

        Each base class is re-initialised on ``self`` immediately before its
        ``stackingStage1FeatureAndLabel`` is called.  To keep the modules
        loosely coupled, the single-model results are gathered by appending
        to a list.

        Returns:
            ``(probStackSet, labelSet)``: the list of per-learner outputs
            and the label set reported by the last learner.
        """
        probStackSet = []
        labelSet = None
        for learner in self._baseLearners:
            learner.__init__(self)
            probSet, labelSet = learner.stackingStage1FeatureAndLabel(
                self, data, label)
            probStackSet.append(probSet)
        return probStackSet, labelSet

    def stackStage1FeatureAndLabel(self, data, label):
        """Return the stage-2 training features and labels.

        NOTE: loading through ``dataPreprocessing.dataStackPort`` is
        disabled here, so ``data`` and ``label`` are forwarded unchanged to
        the base learners.

        Returns:
            ``(probStackPlanResult, labelSet)``: the ``stackingPlan`` result
            over all base-learner outputs, and the matching labels.
        """
        probStackSet, labelSet = self.stackStage1FeatureAndLabelProcess(
            data, label)
        # Execute the stack plan on the collected per-learner outputs.
        probStackPlanResult = self.stackingPlan(probStackSet)
        return probStackPlanResult, labelSet

    def stackStage1Train(self, dataPath, labelPath):
        """Train the base learners (stage 1).

        The arguments are first loaded through
        ``dataPreprocessing.dataStackPort``.  If that raises ``ValueError``
        they are used as-is as data and label -- presumably so callers can
        pass already-loaded arrays; confirm against the callers.
        """
        try:
            data, label = dataPreprocessing.dataStackPort(
                self, dataPath, labelPath)
        except ValueError:
            data = dataPath
            label = labelPath
        self.stackTrainProcess(data, label)

    def stackStage1Test(self, dataPath):
        """Load test data via ``dataPreprocessing.dataTestPort`` and run stage 1."""
        data = dataPreprocessing.dataTestPort(self, dataPath)
        return self.stackTestProcess(data)

    def stackStage2Train(self, dataPath, labelPath):
        """Train and persist the stage-2 model.

        The stage-1 result serves as the stage-2 training set and the
        returned label set as the stage-2 training labels.
        """
        probStackPlanResult, labelSet = self.stackStage1FeatureAndLabel(
            dataPath, labelPath)
        self.stackStage2TrainProcess(probStackPlanResult, labelSet)

    def stackStage2Test(self, dataPath):
        """Run stage 1 and then stage 2 on the data at ``dataPath``.

        Returns:
            The ``(TopPreds, probModel)`` tuple produced by
            ``stackStage2TestProcess``.
        """
        prob = self.stackStage1Test(dataPath)
        return self.stackStage2TestProcess(prob)
#def stackEval(self,dataPath)
if __name__ == "__main__":
    # Manual smoke test: drives the whole two-stage pipeline on local files.
    dataFile = 'D:/2017_8_8_task/20170808/SAR/task_classification/comp_dataset/fc7.txt'
    labelFile = 'D:/2017_8_8_task/test/Stacking_test/labelinfo.txt'

    ensemble = stacking()

    # Stage-1 training check.
    ensemble.stackStage1Train(dataFile, labelFile)

    # Stage-2 training check.
    ensemble.stackStage2Train(dataFile, labelFile)

    # Stage-1 followed by stage-2 inference check.
    stage2Result = ensemble.stackStage2Test(dataFile)

    # Fetch the features and labels produced by stage 1.
    stage1Features, stage1Labels = ensemble.stackStage1FeatureAndLabel(
        dataFile, labelFile)