benchmark.py
import numpy as np
import pandas as pd
import scipy.stats
from lightgbm import LGBMRegressor
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

from regression_classifier import ClassRegressorEnsemble, ClassRegressorOnelevelEnsemble


def load_dataframe():
    """Load the housing dataset, drop rows with missing values and
    one-hot encode the categorical `ocean_proximity` column."""
    df = pd.read_csv('./data/housing.csv')
    df = df.dropna().reset_index(drop=True)
    df = pd.get_dummies(df, columns=['ocean_proximity'], prefix='ocean', drop_first=True)
    return df
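
# Usage sketch (hypothetical, assuming the standard California housing CSV with
# columns such as `median_income` and `ocean_proximity`):
#
#     df = load_dataframe()
#     print(df.shape, df.columns.tolist())
#
# After get_dummies, the categorical column is replaced by `ocean_*` indicator
# columns, so every remaining feature is numeric.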


def run_benchmark(train_X, test_X, train_Y, test_Y, model, hparam_space, search_n_iter=30):
    """Tune `model` with randomized search on the training split and report
    the test-set MAE of the best configuration found."""
    search = RandomizedSearchCV(
        model,
        cv=KFold(n_splits=4),
        param_distributions=hparam_space,
        scoring='neg_mean_absolute_error',
        verbose=8,
        n_jobs=4,
        n_iter=search_n_iter,
    )
    search.fit(train_X, train_Y)

    # Evaluate the refitted best estimator on the held-out test split.
    pred_test = search.predict(test_X)
    mae = mean_absolute_error(test_Y, pred_test)

    benchmark_result = {
        'best_params': search.best_params_,
        'score': mae,
    }
    print(benchmark_result)
    return benchmark_result
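
# Usage sketch (hypothetical): run a single benchmark outside run_benchmarks(),
# e.g. a plain ElasticNet baseline with a tiny search budget.
#
#     train_X, test_X, train_Y, test_Y = train_test_split(X, y)
#     baseline = Pipeline([
#         ('imputer', SimpleImputer()),
#         ('scaler', StandardScaler()),
#         ('model', ElasticNet()),
#     ])
#     run_benchmark(train_X, test_X, train_Y, test_Y, baseline,
#                   hparam_space={'model__alpha': [0.1, 1.0, 10.0]},
#                   search_n_iter=3)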


def run_benchmarks():
    df = load_dataframe()
    target_name = 'median_house_value'
    X, y = df.drop(columns=[target_name]), df[target_name]
    train_X, test_X, train_Y, test_Y = train_test_split(X, y)

    # One pipeline per benchmarked model; `hparam_spaces` below is
    # index-aligned with this list.
    pipelines = [
        Pipeline([
            ('imputer', SimpleImputer()),
            ('scaler', StandardScaler()),
            ('model', ClassRegressorEnsemble()),
        ]),
        Pipeline([
            ('imputer', SimpleImputer()),
            ('scaler', StandardScaler()),
            ('model', ClassRegressorOnelevelEnsemble()),
        ]),
        Pipeline([
            ('imputer', SimpleImputer()),
            ('scaler', StandardScaler()),
            ('model', ElasticNet()),
        ]),
        Pipeline([
            ('imputer', SimpleImputer()),
            ('scaler', StandardScaler()),
            ('model', LGBMRegressor()),
        ]),
    ]
    hparam_spaces = [
        {
            'model__n_bins': [2, 5],
            'model__n_levels': [2, 5, 10, 30],
            'model__bin_calc_method': ['equal', 'percentile'],
            'model__leaf_size': [10, 50, 100],
            'model__leaf_model_cls': [DummyRegressor, LinearRegression],
        },
        {
            'model__n_bins': [10, 20, 30],
            'model__bin_calc_method': ['equal', 'percentile'],
            'model__leaf_model_cls': [DummyRegressor, LinearRegression, None],
        },
        {
            'model__alpha': scipy.stats.norm(0.5, 1),
            'model__l1_ratio': scipy.stats.norm(0.5, 0.15),
        },
        {
            'model__max_depth': np.arange(-1, 20, 2),
            'model__subsample': np.arange(0.2, 1.2, 0.2),
            'model__n_estimators': np.arange(10, 310, 40),
        },
    ]

    results = {}
    for model, hparam_space in tqdm(zip(pipelines, hparam_spaces), total=len(pipelines)):
        results[model.named_steps.model.__class__.__name__] = run_benchmark(
            train_X, test_X, train_Y, test_Y, model, hparam_space)
    return results


if __name__ == '__main__':
    results = run_benchmarks()
    print(results)
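
# Optional follow-up sketch (hypothetical, not part of the original script):
# persist the results so runs can be compared later. `default=str` is needed
# because `best_params` may contain class objects such as LinearRegression.
#
#     import json
#     with open('benchmark_results.json', 'w') as f:
#         json.dump(results, f, indent=2, default=str)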