-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbenchmarks.py
177 lines (147 loc) · 4.9 KB
/
benchmarks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# Standard and GIS Modules
import os
import sys
import numpy as np
import pandas as pd
import time
# ignore linalg warnings from MGWR package
import warnings
warnings.filterwarnings("ignore")
# gwr:
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sprf.spatial_random_forest import SpatialRandomForest
from sprf.geographical_random_forest import GeographicalRandomForest
from models import *
def get_folds(nr_samples, nr_folds=10):
    """Randomly split the sample indices into train/test folds.

    The indices ``0 .. nr_samples - 1`` are shuffled once with
    ``np.random.permutation`` and cut into ``nr_folds`` consecutive chunks
    of ``nr_samples // nr_folds`` elements; the final chunk additionally
    takes whatever remains.

    Args:
        nr_samples: Total number of samples to distribute.
        nr_folds: Number of folds to create.

    Returns:
        Tuple ``(train_inds, test_inds)`` of two lists with one index array
        per fold. For each fold the test array is that fold's chunk and the
        train array holds all other shuffled indices.
    """
    shuffled = np.random.permutation(nr_samples)
    fold_size = nr_samples // nr_folds
    last_fold = nr_folds - 1
    train_inds, test_inds = [], []
    for fold in range(nr_folds):
        start = fold * fold_size
        # The last fold runs to the end so that no sample is left out.
        stop = nr_samples if fold >= last_fold else start + fold_size
        held_out = np.arange(start, stop)
        test_inds.append(shuffled[held_out])
        train_inds.append(np.delete(shuffled, held_out))
    return train_inds, test_inds
def prepare_data(data, target, lon="x", lat="y"):
    """Return ``data`` with the target and coordinate columns renamed.

    The ``target`` column becomes "label", the ``lon`` column "x_coord" and
    the ``lat`` column "y_coord". Every other column keeps its name and is
    assumed to be a covariate.
    """
    standard_names = {}
    standard_names[target] = "label"
    standard_names[lon] = "x_coord"
    standard_names[lat] = "y_coord"
    return data.rename(columns=standard_names)
def add_metrics(test_pred, test_y, res_dict_init, method, runtime):
    """Return a copy of ``res_dict_init`` extended with one method's scores.

    Args:
        test_pred: Predicted target values for the test samples.
        test_y: True target values for the same samples.
        res_dict_init: Dict of fields shared by all result rows. It is
            copied, never modified.
        method: Value stored under "Method".
        runtime: Value stored under "Runtime".

    Returns:
        A new dict holding the entries of ``res_dict_init`` plus the keys
        "Method", "RMSE", "MAE", "R-Squared" and "Runtime".
    """
    res_dict = res_dict_init.copy()
    res_dict["Method"] = method
    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    # For a single target column both forms give the same value.
    res_dict["RMSE"] = np.sqrt(mean_squared_error(test_y, test_pred))
    # Same (y_true, y_pred) argument order as the r2_score call below.
    res_dict["MAE"] = mean_absolute_error(test_y, test_pred)
    res_dict["R-Squared"] = r2_score(test_y, test_pred)
    res_dict["Runtime"] = runtime
    return res_dict
def cross_validation(data):
    """Benchmark every configured model with 5-fold cross validation.

    Reads the module-level names ``DATASET``, ``dataset_target``,
    ``dataset_x``, ``dataset_y``, ``model_function_names`` and
    ``model_names``, so all of them must be set before the call.

    Args:
        data: DataFrame with the target column, the two coordinate columns
            and the covariates of the current dataset.

    Returns:
        DataFrame with one row per (fold, method) combination, holding
        "fold", "max_depth" and the fields written by ``add_metrics``.

    Raises:
        ValueError: If ``model_function_names`` and ``model_names`` do not
            have the same length.
    """
    # zip() would silently drop the surplus entries if the two lists got out
    # of sync, so fail loudly instead of benchmarking a subset of the models.
    if len(model_function_names) != len(model_names):
        raise ValueError(
            "model_function_names and model_names must have the same length"
        )

    nr_folds = 5
    train_inds, test_inds = get_folds(len(data), nr_folds=nr_folds)
    res_df = []

    # dataset specific information
    target = dataset_target[DATASET]
    x_coord_name = dataset_x.get(DATASET, "x")
    y_coord_name = dataset_y.get(DATASET, "y")

    # model params --> TODO: grid search
    # NOTE: max_depth is only recorded in the result rows and
    # spatial_neighbors is only printed; neither is passed to the model
    # functions from here.
    max_depth = 10
    spatial_neighbors = len(data) // 5  # one fifth of the dataset
    print("Number of neighbors considered for spatial RF:", spatial_neighbors)

    data_renamed = prepare_data(data.copy(), target, x_coord_name, y_coord_name)

    for fold in range(nr_folds):
        res_dict_init = {"fold": fold, "max_depth": max_depth}
        train_data = data_renamed.iloc[train_inds[fold]]
        test_data = data_renamed.iloc[test_inds[fold]]
        # Covariates: everything except the label and the coordinate columns.
        feat_cols = [
            col
            for col in train_data.columns
            if "coord" not in col and col != "label"
        ]

        for model_function, name in zip(model_function_names, model_names):
            # perf_counter is monotonic, so the measured runtime cannot be
            # distorted by system clock adjustments during a run.
            tic = time.perf_counter()
            # Each model gets its own copies of the fold's train/test frames.
            test_pred = model_function(
                train_data.copy(), test_data.copy(), feat_cols=feat_cols,
            )
            runtime = time.perf_counter() - tic
            res_df.append(
                add_metrics(
                    test_pred, test_data["label"], res_dict_init, name, runtime,
                )
            )
            print(name, res_df[-1]["R-Squared"])

    # Finalize results
    res_df = pd.DataFrame(res_df)
    return res_df
# All result files are written below this directory.
os.makedirs("outputs", exist_ok=True)

# Name of the target column for each benchmark dataset.
dataset_target = dict(
    plants="richness_species_vascular",
    meuse="zinc",
    atlantic="Rate",
    deforestation="deforestation_quantile",
    california_housing="median_house_value",
)

# Every benchmarked model function together with the label used for it in
# the results. The two lists consumed by cross_validation are derived from
# these pairs.
_benchmarked_models = [
    (linear_regression, "linear regression"),
    (rf_coordinates, "RF (coordinates)"),
    (rf_global, "RF"),
    (rf_spatial, "spatial RF"),
    (my_gwr, "GWR"),
    (kriging, "Kriging"),
    (sarm, "SAR"),
    (slx, "SLX"),
    # (rf_geographical, "geographical RF"),
]
model_function_names = [function for function, _ in _benchmarked_models]
model_names = [label for _, label in _benchmarked_models]

# Datasets to benchmark, in processing order.
datasets = [
    "meuse",
    "plants",
    "atlantic",
    "deforestation",
    "california_housing",
]
# Seed once so that the fold assignment of the whole run is reproducible.
np.random.seed(42)

for DATASET in datasets:
    print("\nDATASET", DATASET, "\n")

    # Both mappings are left empty, so cross_validation falls back to the
    # default coordinate column names "x" and "y".
    dataset_x = {}
    dataset_y = {}

    data_path = os.path.join("data", DATASET + ".csv")
    data = pd.read_csv(data_path)
    print("Number of samples", len(data))

    # Per-fold scores of every method.
    results = cross_validation(data)
    folds_csv = os.path.join("outputs", f"results_{DATASET}_folds.csv")
    results.to_csv(folds_csv, index=False)

    # Scores averaged over the folds, best (lowest) RMSE first.
    method_means = results.groupby(["Method"]).mean()
    method_means = method_means.drop(["fold", "max_depth"], axis=1)
    results_grouped = method_means.sort_values("RMSE")
    summary_csv = os.path.join("outputs", f"results_{DATASET}.csv")
    results_grouped.to_csv(summary_csv)

    print(results_grouped)
    print("--------------")