Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

draft of sliding validation #362

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions cybench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@
# Static predictors. Add more when available
STATIC_PREDICTORS = SOIL_PROPERTIES

# Yield trend
TREND_WINDOW = 5
YIELD_TREND_FEATURES = [KEY_TARGET + "-" + str(i) for i in range(1, TREND_WINDOW + 1)]

# Weather indicators
METEO_INDICATORS = ["tmin", "tmax", "tavg", "prec", "cwb", "rad"]

Expand Down Expand Up @@ -159,7 +163,7 @@
}

# All predictors. Add more when available
ALL_PREDICTORS = STATIC_PREDICTORS + TIME_SERIES_PREDICTORS
ALL_PREDICTORS = STATIC_PREDICTORS + YIELD_TREND_FEATURES + TIME_SERIES_PREDICTORS

# Crop calendar entries: start of season, end of season.
# doy = day of year (1 to 366).
Expand All @@ -183,7 +187,7 @@
# Lead time for forecasting
# Choices: "middle-of-season", "quarter-of-season",
# "n-day(s)" where n is an integer
FORECAST_LEAD_TIME = "middle-of-season"
FORECAST_LEAD_TIME = "60-days"

# Buffer period before the start of season
SPINUP_DAYS = 90
Expand Down
15 changes: 13 additions & 2 deletions cybench/datasets/configured.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
align_inputs_and_labels,
)

from cybench.util.data import get_trend_features


def _load_and_preprocess_time_series_data(
crop, country_code, ts_input, index_cols, ts_cols, df_crop_cal
Expand Down Expand Up @@ -74,19 +76,28 @@ def load_dfs(crop: str, country_code: str) -> tuple:
df_y = df_y[[KEY_LOC, KEY_YEAR, KEY_TARGET]]
df_y = df_y.dropna(axis=0)
df_y = df_y[df_y[KEY_TARGET] > 0.0]
df_y.set_index([KEY_LOC, KEY_YEAR], inplace=True)
# check empty targets
if df_y.empty:
return df_y, {}

# yield trend
df_y_trend = get_trend_features(df_y, 5)
df_y_trend.set_index([KEY_LOC, KEY_YEAR], inplace=True)

# set index of df_y
df_y.set_index([KEY_LOC, KEY_YEAR], inplace=True)

# soil
df_x_soil = pd.read_csv(
os.path.join(path_data_cn, "_".join(["soil", crop, country_code]) + ".csv"),
header=0,
)
df_x_soil = df_x_soil[[KEY_LOC] + SOIL_PROPERTIES]
df_x_soil.set_index([KEY_LOC], inplace=True)
dfs_x = {"soil": df_x_soil}
dfs_x = {
"soil": df_x_soil,
"yield_trend" : df_y_trend
}

# crop calendar
df_crop_cal = pd.read_csv(
Expand Down
11 changes: 7 additions & 4 deletions cybench/models/nn_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from cybench.config import (
KEY_TARGET,
STATIC_PREDICTORS,
YIELD_TREND_FEATURES,
TIME_SERIES_PREDICTORS,
ALL_PREDICTORS,
)
Expand All @@ -32,9 +33,10 @@ def separate_ts_static_inputs(batch: dict) -> tuple:
A tuple of torch tensors for time series and static inputs
"""
ts = torch.cat([batch[k].unsqueeze(2) for k in TIME_SERIES_PREDICTORS], dim=2)
trend = torch.cat([batch[k].unsqueeze(1) for k in YIELD_TREND_FEATURES], dim=1)
static = torch.cat([batch[k].unsqueeze(1) for k in STATIC_PREDICTORS], dim=1)

return ts, static
return ts, trend, static


class BaseNNModel(BaseModel, nn.Module):
Expand Down Expand Up @@ -602,6 +604,7 @@ def __init__(
):
# Add all arguments to init_args to enable model reconstruction in fit method
n_ts_inputs = len(TIME_SERIES_PREDICTORS)
n_trend_features = len(YIELD_TREND_FEATURES)
n_static_inputs = len(STATIC_PREDICTORS)
if not time_series_have_same_length:
kwargs["interpolate_time_series"] = True
Expand All @@ -613,7 +616,7 @@ def __init__(

super().__init__(**kwargs)
self._lstm = nn.LSTM(n_ts_inputs, hidden_size, num_layers, batch_first=True)
self._fc = nn.Linear(hidden_size + n_static_inputs, output_size)
self._fc = nn.Linear(hidden_size + + n_trend_features + n_static_inputs, output_size)

def fit(
self,
Expand Down Expand Up @@ -661,9 +664,9 @@ def fit(
)

def forward(self, x):
    """Forward pass: encode the time series with the LSTM, then feed the
    final hidden step together with trend and static features through the
    fully-connected output layer.
    """
    ts_inputs, trend_inputs, static_inputs = separate_ts_static_inputs(x)
    lstm_out, _ = self._lstm(ts_inputs)
    last_step = lstm_out[:, -1, :]
    combined = torch.cat([last_step, trend_inputs, static_inputs], dim=1)
    return self._fc(combined)

Expand Down
33 changes: 33 additions & 0 deletions cybench/models/sklearn_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections.abc import Iterable
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, GroupKFold
Expand All @@ -23,6 +24,7 @@
KEY_TARGET,
KEY_DATES,
SOIL_PROPERTIES,
YIELD_TREND_FEATURES,
TIME_SERIES_INPUTS,
)

Expand Down Expand Up @@ -212,6 +214,9 @@ def _design_features(self, crop: str, data_items: Iterable):
dfs_x[x] = df_ts.reset_index()

features = design_features(crop, dfs_x)
trend_features = data_to_pandas(data_items,
data_cols=[KEY_LOC, KEY_YEAR] + YIELD_TREND_FEATURES)
features = features.merge(trend_features, on=[KEY_LOC, KEY_YEAR])

return features

Expand Down Expand Up @@ -359,3 +364,31 @@ def fit(self, train_dataset: Dataset, **fit_params):
}

super().fit(train_dataset, **fit_params)

class SklearnKNN(BaseSklearnModel):
    """K-nearest neighbors regression baseline built on scikit-learn.

    Uses distance-weighted neighbor averaging; the number of neighbors is
    tuned by grid search during ``fit``.
    """

    def __init__(self, feature_cols: list = None):
        """Initialize the model.

        Args:
            feature_cols (list): feature columns to use; None selects the
                default feature set chosen by BaseSklearnModel.
        """
        knn = KNeighborsRegressor(weights="distance")

        kwargs = {
            "feature_cols": feature_cols,
            "estimator": knn,
        }

        super().__init__(**kwargs)

    def fit(self, train_dataset: Dataset, **fit_params):
        """Fit or train the model, tuning ``n_neighbors`` via grid search.

        Args:
            train_dataset (Dataset): training dataset
            **fit_params: Additional parameters.

        Returns:
            Whatever BaseSklearnModel.fit returns (per its docstring, a tuple
            of the fitted model and a dict with additional information).
        """
        fit_params["optimize_hyperparameters"] = True
        fit_params["param_space"] = {
            "estimator__n_neighbors": [3, 5, 7, 9],
        }

        # Propagate the result so the documented contract holds; the
        # original version dropped super().fit's return value and always
        # returned None despite documenting a tuple return.
        return super().fit(train_dataset, **fit_params)
135 changes: 69 additions & 66 deletions cybench/runs/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,17 @@
from cybench.evaluation.eval import evaluate_predictions
from cybench.models.naive_models import AverageYieldModel
from cybench.models.trend_models import TrendModel
from cybench.models.sklearn_models import SklearnRidge, SklearnRandomForest
from cybench.models.sklearn_models import (
SklearnRidge,
SklearnRandomForest,
SklearnKNN,
)

from cybench.models.nn_models import (
BaselineLSTM,
BaselineInceptionTime,
BaselineTransformer,
)
from cybench.util.features import dekad_from_date

from cybench.models.residual_models import (
RidgeRes,
Expand All @@ -43,6 +47,7 @@
"RidgeRes": RidgeRes,
"SklearnRF": SklearnRandomForest,
"RFRes": RandomForestRes,
"SklearnKNN": SklearnKNN,
"LSTM": BaselineLSTM,
"LSTMRes": LSTMRes,
"InceptionTime": BaselineInceptionTime,
Expand Down Expand Up @@ -157,37 +162,44 @@ def run_benchmark(
else:
sel_years = all_years

for test_year in sel_years:
train_years = [y for y in all_years if y != test_year]
test_years = [test_year]
train_dataset, test_dataset = dataset.split_on_years((train_years, test_years))

# TODO: put into generic function
models_init_kwargs["Transformer"] = {
"seq_len": train_dataset.max_season_window_length,
}
models_init_kwargs["TransformerRes"] = {
"seq_len": train_dataset.max_season_window_length,
}

labels = test_dataset.targets()

model_output = {
KEY_LOC: [loc_id for loc_id, _ in test_dataset.indices()],
KEY_YEAR: [year for _, year in test_dataset.indices()],
KEY_TARGET: labels,
}

for model_name, model_constructor in model_constructors.items():
model = model_constructor(**models_init_kwargs[model_name])
model.fit(train_dataset, **models_fit_kwargs[model_name])
predictions, _ = model.predict(test_dataset)
model_output[model_name] = predictions

df = pd.DataFrame.from_dict(model_output)
df[KEY_COUNTRY] = df[KEY_LOC].str[:2]
df.set_index([KEY_COUNTRY, KEY_LOC, KEY_YEAR], inplace=True)
df.to_csv(os.path.join(path_results, f"{dataset_name}_year_{test_year}.csv"))
# NOTE The test set is different per dataset in MLBaseline.
# Examples:
# Wheat, NL: 2012-2018
# Wheat or Maize, ES: 2012-2016
# Wheat or Maize, FR: 2010-2018
test_years = sorted([yr for yr in all_years if (yr >= 2012) and (yr <= 2016)])
train_years = sorted([yr for yr in all_years if yr < 2012])
if len(train_years) == 0 or len(test_years) == 0:
return {"df_metrics" : None}

train_dataset, test_dataset = dataset.split_on_years((train_years, test_years))

# TODO: put into generic function
models_init_kwargs["Transformer"] = {
"seq_len": train_dataset.max_season_window_length,
}
models_init_kwargs["TransformerRes"] = {
"seq_len": train_dataset.max_season_window_length,
}

labels = test_dataset.targets()

model_output = {
KEY_LOC: [loc_id for loc_id, _ in test_dataset.indices()],
KEY_YEAR: [year for _, year in test_dataset.indices()],
KEY_TARGET: labels,
}

for model_name, model_constructor in model_constructors.items():
model = model_constructor(**models_init_kwargs[model_name])
model.fit(train_dataset, **models_fit_kwargs[model_name])
predictions, _ = model.predict(test_dataset)
model_output[model_name] = predictions

df = pd.DataFrame.from_dict(model_output)
df[KEY_COUNTRY] = df[KEY_LOC].str[:2]
df.set_index([KEY_COUNTRY, KEY_LOC, KEY_YEAR], inplace=True)
df.to_csv(os.path.join(path_results, f"{dataset_name}.csv"))

df_metrics = compute_metrics(run_name, list(model_constructors.keys()))

Expand Down Expand Up @@ -274,32 +286,28 @@ def compute_metrics(
country_codes = df_all[KEY_COUNTRY].unique()
for cn in country_codes:
df_cn = df_all[df_all[KEY_COUNTRY] == cn]
all_years = sorted(df_cn[KEY_YEAR].unique())
for yr in all_years:
df_yr = df_cn[df_cn[KEY_YEAR] == yr]
y_true = df_yr[KEY_TARGET].values
if model_names is None:
model_names = [
c
for c in df_yr.columns
if c not in [KEY_COUNTRY, KEY_LOC, KEY_YEAR, KEY_TARGET]
]

for model_name in model_names:
metrics = evaluate_predictions(y_true, df_yr[model_name].values)
metrics_row = {
KEY_COUNTRY: cn,
"model": model_name,
KEY_YEAR: yr,
}

for metric_name, value in metrics.items():
metrics_row[metric_name] = value

rows.append(metrics_row)
y_true = df_cn[KEY_TARGET].values
if model_names is None:
model_names = [
c
for c in df_cn.columns
if c not in [KEY_COUNTRY, KEY_LOC, KEY_YEAR, KEY_TARGET]
]

for model_name in model_names:
metrics = evaluate_predictions(y_true, df_cn[model_name].values)
metrics_row = {
KEY_COUNTRY: cn,
"model": model_name,
}

for metric_name, value in metrics.items():
metrics_row[metric_name] = value

rows.append(metrics_row)

df_all = pd.DataFrame(rows)
df_all.set_index([KEY_COUNTRY, "model", KEY_YEAR], inplace=True)
df_all.set_index([KEY_COUNTRY, "model"], inplace=True)

return df_all

Expand Down Expand Up @@ -337,12 +345,12 @@ def run_benchmark_on_all_data():
"AverageYieldModel",
"LinearTrend",
"SklearnRidge",
"RidgeRes",
"SklearnRF",
"SklearnKNN",
"LSTM",
"LSTMRes",
]
# override epochs for nn-models
nn_models_epochs = 5
nn_models_epochs = 50
results = run_benchmark(
run_name=run_name,
dataset_name=dataset_name,
Expand All @@ -352,9 +360,4 @@ def run_benchmark_on_all_data():
else:
results = run_benchmark(run_name=run_name, dataset_name=dataset_name)

df_metrics = results["df_metrics"].reset_index()
print(
df_metrics.groupby("model").agg(
{"normalized_rmse": "mean", "mape": "mean", "r2": "mean"}
)
)
print(results["df_metrics"].head())
11 changes: 11 additions & 0 deletions cybench/util/data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
import numpy as np

from cybench.config import KEY_LOC, KEY_YEAR, KEY_TARGET

def data_to_pandas(data_items, data_cols=None):
"""Convert data items as dict to pandas DataFrame
Expand Down Expand Up @@ -45,3 +46,13 @@ def trim_time_series_data(sample: dict, num_time_steps: int, time_series_keys: l
)

return sample

def get_trend_features(df, trend_window):
    """Build lagged-yield ("trend") features from a yield DataFrame.

    For each location, adds one column per lag ``f"{KEY_TARGET}-{i}"``
    (i = 1..trend_window) holding the target value shifted by i rows within
    that location's year-sorted history. Rows without a complete lag window
    (e.g. the first ``trend_window`` years of each location) are dropped, as
    is the current-year target column itself.

    Args:
        df: DataFrame with flat columns KEY_LOC, KEY_YEAR and KEY_TARGET
            (not index levels).
        trend_window (int): number of lagged years to generate; values < 1
            yield no new columns.

    Returns:
        A new DataFrame (the input is not modified) with KEY_LOC, KEY_YEAR
        and the trend feature columns.
    """
    # sort_values returns a copy, so mutations below don't touch df
    trend_fts = df.sort_values(by=[KEY_LOC, KEY_YEAR])
    for lag in range(trend_window, 0, -1):
        # NOTE(review): shift lags by rows, assuming one row per
        # (location, year) with no missing years; a gap in years would
        # silently lag across it — confirm upstream data is dense.
        trend_fts[f"{KEY_TARGET}-{lag}"] = trend_fts.groupby([KEY_LOC])[
            KEY_TARGET
        ].shift(lag)

    return trend_fts.dropna(axis=0).drop(columns=[KEY_TARGET])
Loading