-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathgradient_boosting_decision_tree.py
76 lines (58 loc) · 2.4 KB
/
gradient_boosting_decision_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import numpy as np
from sklearn.datasets import fetch_california_housing
from decision_tree import DecisionTree
# TODO: support classification (only regression is implemented so far).
def squared_loss(y, pred):
    """Return half of the mean squared difference between ``pred`` and ``y``.

    Args:
        y: Target values.
        pred: Predicted values; combined with ``y`` by element-wise
            subtraction.

    Returns:
        ``mean((pred - y) ** 2) / 2``.
    """
    residual = pred - y
    return np.mean(np.square(residual)) / 2
def squared_loss_gradient(y, pred):
    """Return the element-wise residual ``pred - y``.

    This is the derivative of ``(pred - y) ** 2 / 2`` with respect to
    ``pred``.
    """
    residual = pred - y
    return residual
def absolute_loss_gradient(y, pred):
    """Return the sign of the residual ``pred - y`` for each element.

    This is the (sub)gradient of ``|pred - y|`` with respect to ``pred``:
    ``1`` where the prediction is too high, ``-1`` where it is too low and
    ``0`` where it matches.
    """
    residual = pred - y
    return np.sign(residual)
class GBDT(object):
    """Gradient-boosted decision trees for regression with squared loss.

    The model starts from a constant prediction (the mean of the training
    targets) and repeatedly fits a ``DecisionTree`` to the gradient of the
    squared loss. Each tree's output is subtracted from the running
    prediction, scaled by a per-tree step size ``rho`` (chosen by grid
    search) and by the global ``shrinkage`` factor.

    Args:
        regression: Stored on the instance; not consulted by the current
            implementation, which always builds regression trees.
        tree_num: Number of boosting rounds (trees) built by ``fit``.
        max_depth: Depth passed to every ``DecisionTree``.
        shrinkage: Factor applied to every tree's contribution, both while
            training and in ``predict``. Defaults to ``0.5``.
    """

    def __init__(self, regression=True, tree_num=20, max_depth=4,
                 shrinkage=0.5):
        self.regression = regression
        self.max_depth = max_depth
        self.tree_num = tree_num
        self.forest = []  # fitted trees, in boosting order
        self.rhos = np.ones(self.tree_num)  # per-tree step sizes
        self.t0 = 0  # constant initial prediction, set by fit()
        self.shrinkage = shrinkage

    def get_importance(self):
        """Return the sum of every tree's ``get_importance()`` divided by ``tree_num``."""
        total = sum(tree.get_importance() for tree in self.forest)
        return total / self.tree_num

    def _linear_search(self, y, pred, delta):
        """Pick the step size that minimizes the squared loss on a fixed grid.

        Candidates come from ``np.arange(0.1, 10, 0.1)``; the loss is
        evaluated for ``pred - rho * delta`` and the candidate with the
        smallest loss is returned.
        """
        step = 0.1
        candidates = np.arange(step, 10, step)
        losses = [squared_loss(y, pred - rho * delta) for rho in candidates]
        return candidates[np.argmin(losses)]

    def fit(self, x, y):
        """Fit ``tree_num`` boosted trees to features ``x`` and targets ``y``.

        Any previously fitted trees and step sizes are discarded first, so
        calling ``fit`` again retrains the model from scratch instead of
        leaving stale or unfitted trees in ``self.forest``.

        Prints one progress line per tree.
        """
        # Start from a clean state so repeated calls do not accumulate trees.
        self.forest = []
        self.rhos = np.ones(self.tree_num)

        self.t0 = y.mean()  # t0, which is a constant
        # Explicit float array so the running prediction has a stable shape.
        pred = np.full(np.shape(y), self.t0, dtype=float)

        for i in range(self.tree_num):
            grad = squared_loss_gradient(y, pred)
            tree = DecisionTree(
                metric_type="Variance", depth=self.max_depth, regression=True)
            tree.fit(x, grad)
            # Only keep the tree once it has been fitted successfully.
            self.forest.append(tree)
            delta = tree.predict(x)
            # find best learning rate
            self.rhos[i] = self._linear_search(y, pred, delta)
            pred = pred - self.shrinkage * delta * self.rhos[i]
            # for categorical dataset, use cross entropy loss
            print("tree {} constructed, rho {}, loss {}".format(
                i, self.rhos[i], squared_loss(y, pred)))

    def predict(self, x):
        """Return ``t0`` minus the scaled sum of all tree predictions for ``x``.

        Each tree's output is multiplied by its step size ``rho`` and by
        ``shrinkage`` before being summed.
        """
        contributions = [
            tree.predict(x) * rho * self.shrinkage
            for tree, rho in zip(self.forest, self.rhos)
        ]
        return self.t0 - np.array(contributions).sum(axis=0)
def main():
    """Train a default ``GBDT`` on California housing and print the results.

    Rows are assigned to the test set when an independent uniform draw in
    [0, 1) falls below ``test_ratio``; the rest form the training set.
    Prints the model's feature importance, then the training loss, then the
    test loss.
    """
    dataset = fetch_california_housing(data_home='data')
    test_ratio = 0.2

    # One random draw per row decides which split the row belongs to.
    draws = np.random.uniform(0, 1, len(dataset.data))
    in_train = draws >= test_ratio
    in_test = draws < test_ratio
    train_x, train_y = dataset.data[in_train], dataset.target[in_train]
    test_x, test_y = dataset.data[in_test], dataset.target[in_test]

    model = GBDT()
    model.fit(train_x, train_y)
    print(model.get_importance())
    for features, targets in ((train_x, train_y), (test_x, test_y)):
        print(squared_loss(targets, model.predict(features)))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()