Skip to content

Commit

Permalink
Add CV wrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
ThomasMeissnerDS committed Jul 4, 2023
1 parent f356a6b commit 594aa74
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 3 deletions.
4 changes: 2 additions & 2 deletions bluecast/blueprints/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ class BlueCast:
:param :class_problem: Takes a string containing the class problem type. Either "binary" or "multiclass".
:param :target_column: Takes a string containing the name of the target column.
:param :cat_columns: Takes a list of strings containing the names of the categorical columns. If not provided,
BlueCast will infer these automaically.
BlueCast will infer these automatically.
:param :date_columns: Takes a list of strings containing the names of the date columns. If not provided,
BlueCast will infer these automaically.
BlueCast will infer these automatically.
:param :time_split_column: Takes a string containing the name of the time split column. If not provided,
BlueCast will not split the data by time or order, but do a random split instead.
:param :ml_model: Takes an instance of a XgboostModel class. If not provided, BlueCast will instantiate one.
Expand Down
119 changes: 119 additions & 0 deletions bluecast/blueprints/cast_cv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from typing import Any, List, Literal, Optional, Tuple, Union

import pandas as pd
from sklearn.model_selection import StratifiedKFold

from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import (
TrainingConfig,
XgboostFinalParamConfig,
XgboostTuneParamsConfig,
)


class BlueCastCV:
    """Cross-validation wrapper that trains one BlueCast pipeline per fold.

    Every call to :meth:`fit` or :meth:`fit_eval` splits the data with a
    stratifier (a 5-fold shuffled ``StratifiedKFold`` unless one is passed in),
    trains a separate ``BlueCast`` instance per split and appends it to
    ``bluecast_models``. :meth:`predict` runs all stored pipelines and either
    returns their individual outputs or the average probability.

    :param class_problem: Either "binary" or "multiclass"; forwarded to every
        ``BlueCast`` instance.
    :param conf_training: Optional ``TrainingConfig``. A default instance is
        created on the first fit call when omitted.
    :param conf_xgboost: Optional ``XgboostTuneParamsConfig`` forwarded to
        every ``BlueCast`` instance.
    :param conf_params_xgboost: Optional ``XgboostFinalParamConfig`` forwarded
        to every ``BlueCast`` instance.
    """

    def __init__(
        self,
        class_problem: Literal["binary", "multiclass"] = "binary",
        conf_training: Optional[TrainingConfig] = None,
        conf_xgboost: Optional[XgboostTuneParamsConfig] = None,
        conf_params_xgboost: Optional[XgboostFinalParamConfig] = None,
    ):
        self.class_problem = class_problem
        self.conf_training = conf_training
        self.conf_xgboost = conf_xgboost
        self.conf_params_xgboost = conf_params_xgboost
        # One fitted BlueCast pipeline per cross-validation split.
        self.bluecast_models: List[BlueCast] = []

    def prepare_data(
        self, df: pd.DataFrame, target: str
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """Split ``df`` into a feature frame and the target series.

        :param df: DataFrame that contains the target column.
        :param target: Name of the target column.
        :return: Tuple of (features without the target column, target series).
        """
        y = df[target]
        X = df.drop(target, axis=1)
        return X, y

    def _resolve_cv_setup(
        self, stratifier: Optional[Any]
    ) -> Tuple[TrainingConfig, Any]:
        """Return the training config and splitter, creating defaults if missing."""
        if self.conf_training is None:
            self.conf_training = TrainingConfig()

        if stratifier is None:
            stratifier = StratifiedKFold(
                n_splits=5,
                shuffle=True,
                random_state=self.conf_training.global_random_state,
            )
        return self.conf_training, stratifier

    def _build_model(self, target_col: str) -> BlueCast:
        """Create an unfitted BlueCast instance using this wrapper's configs."""
        return BlueCast(
            class_problem=self.class_problem,
            target_column=target_col,
            conf_training=self.conf_training,
            conf_xgboost=self.conf_xgboost,
            conf_params_xgboost=self.conf_params_xgboost,
        )

    def fit(
        self, df: pd.DataFrame, target: str, stratifier: Optional[Any] = None
    ) -> None:
        """Fit one BlueCast pipeline per split on the fold-reordered full data.

        For each split the training rows are stacked on top of the validation
        rows, the target column is re-attached and the resulting frame is
        handed to ``BlueCast.fit``.

        :param df: DataFrame including the target column.
        :param target: Name of the target column.
        :param stratifier: Optional splitter exposing ``split(X, y)``. Defaults
            to a 5-fold shuffled ``StratifiedKFold``.
        """
        X, y = self.prepare_data(df, target)
        conf_training, stratifier = self._resolve_cv_setup(stratifier)

        for fn, (trn_idx, val_idx) in enumerate(stratifier.split(X, y)):
            # Stack rows (axis=0), not columns: the folds share the same
            # columns and only differ in which rows they hold.
            x_train = pd.concat(
                [X.iloc[trn_idx], X.iloc[val_idx]], axis=0
            ).reset_index(drop=True)
            y_train = pd.concat(
                [y.iloc[trn_idx], y.iloc[val_idx]], axis=0
            ).reset_index(drop=True)
            # Both pieces were reset to the same RangeIndex, so this aligns 1:1.
            x_train[target] = y_train

            # NOTE(review): the config object is shared by all models and the
            # seed offset accumulates across folds (and across repeated calls).
            conf_training.global_random_state += fn

            automl = self._build_model(target)
            automl.fit(x_train, target_col=target)
            self.bluecast_models.append(automl)

    def fit_eval(
        self, df: pd.DataFrame, target_col: str, stratifier: Optional[Any] = None
    ) -> None:
        """Fit one BlueCast pipeline per split and evaluate it on the held-out fold.

        :param df: DataFrame including the target column.
        :param target_col: Name of the target column.
        :param stratifier: Optional splitter exposing ``split(X, y)``. Defaults
            to a 5-fold shuffled ``StratifiedKFold``.
        """
        X, y = self.prepare_data(df, target_col)
        conf_training, stratifier = self._resolve_cv_setup(stratifier)

        for fn, (trn_idx, val_idx) in enumerate(stratifier.split(X, y)):
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            # Copy before attaching the target so we never write into a slice
            # of the caller's data (avoids pandas' SettingWithCopyWarning).
            X_train = X.iloc[trn_idx].copy()
            X_train[target_col] = y.iloc[trn_idx]

            # NOTE(review): the config object is shared by all models and the
            # seed offset accumulates across folds (and across repeated calls).
            conf_training.global_random_state += fn

            automl = self._build_model(target_col)
            automl.fit_eval(X_train, X_val, y_val, target_col=target_col)
            self.bluecast_models.append(automl)

    def predict(
        self, df: pd.DataFrame, return_sub_models_preds: bool = False
    ) -> Tuple[Union[pd.DataFrame, pd.Series], Union[pd.DataFrame, pd.Series]]:
        """Predict with every stored pipeline and optionally average the results.

        The input frame is left untouched; predictions are collected in a
        separate frame that shares the input's index.

        :param df: Feature DataFrame to predict on.
        :param return_sub_models_preds: If True, return the per-model outputs
            as two DataFrames with columns ``proba_<i>`` and ``classes_<i>``.
            If False, return the row-wise mean probability and a boolean series
            that is True where that mean exceeds 0.5.
        :return: Tuple of (probabilities, classes).

        NOTE(review): the averaging branch presumably only makes sense for
        binary problems with one probability per row - confirm for multiclass.
        """
        preds = pd.DataFrame(index=df.index)
        prob_cols: List[str] = []
        class_cols: List[str] = []
        for fn, pipeline in enumerate(self.bluecast_models):
            # Each pipeline gets its own copy so no model sees another
            # model's changes and the caller's frame is never modified.
            y_probs, y_classes = pipeline.predict(df.copy())
            preds[f"proba_{fn}"] = y_probs
            preds[f"classes_{fn}"] = y_classes
            prob_cols.append(f"proba_{fn}")
            class_cols.append(f"classes_{fn}")

        if return_sub_models_preds:
            return preds.loc[:, prob_cols], preds.loc[:, class_cols]

        mean_probs = preds.loc[:, prob_cols].mean(axis=1)
        return mean_probs, mean_probs > 0.5
2 changes: 1 addition & 1 deletion bluecast/config/training_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class TrainingConfig:
calculate_shap_values: bool = True
train_size: float = 0.8
train_split_stratify: bool = True
use_full_data_for_final_model: bool = True
use_full_data_for_final_model: bool = False
min_features_to_select: int = 5
cat_encoding_via_ml_algorithm: bool = False

Expand Down
39 changes: 39 additions & 0 deletions bluecast/tests/test_cast_cv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import Tuple

import pandas as pd
import pytest

from bluecast.blueprints.cast_cv import BlueCastCV
from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig
from bluecast.tests.make_data.create_data import create_synthetic_dataframe


@pytest.fixture
def synthetic_train_test_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Provide a (train, validation) pair of synthetic DataFrames.

    Both frames come from ``create_synthetic_dataframe`` called with 2000
    (presumably the row count - confirm against the helper) and differ only in
    the random state: 20 for the first frame, 200 for the second.
    """
    return (
        create_synthetic_dataframe(2000, random_state=20),
        create_synthetic_dataframe(2000, random_state=200),
    )


def test_blueprint_cv_xgboost(synthetic_train_test_data):
    """Smoke-test BlueCastCV: fit_eval on the train frame, predict on the other.

    Checks that both outputs of ``predict`` contain one entry per row of the
    validation frame.
    """
    df_train, df_val = synthetic_train_test_data

    # Tuning settings set explicitly for this test.
    tune_config = XgboostTuneParamsConfig()
    tune_config.steps_max = 100
    tune_config.num_leaves_max = 16
    training_config = TrainingConfig()
    training_config.hyperparameter_tuning_rounds = 10

    cv_model = BlueCastCV(conf_training=training_config, conf_xgboost=tune_config)
    cv_model.fit_eval(df_train, target_col="target")
    print("Autotuning successful.")

    features_only = df_val.drop("target", axis=1)
    y_probs, y_classes = cv_model.predict(features_only)
    print("Predicting successful.")

    expected_rows = len(df_val.index)
    assert len(y_probs) == expected_rows
    assert len(y_classes) == expected_rows

0 comments on commit 594aa74

Please sign in to comment.