diff --git a/bluecast/blueprints/cast.py b/bluecast/blueprints/cast.py index 7b3fd826..b70881ad 100644 --- a/bluecast/blueprints/cast.py +++ b/bluecast/blueprints/cast.py @@ -44,9 +44,9 @@ class BlueCast: :param :class_problem: Takes a string containing the class problem type. Either "binary" or "multiclass". :param :target_column: Takes a string containing the name of the target column. :param :cat_columns: Takes a list of strings containing the names of the categorical columns. If not provided, - BlueCast will infer these automaically. + BlueCast will infer these automatically. :param :date_columns: Takes a list of strings containing the names of the date columns. If not provided, - BlueCast will infer these automaically. + BlueCast will infer these automatically. :param :time_split_column: Takes a string containing the name of the time split column. If not provided, BlueCast will not split the data by time or order, but do a random split instead. :param :ml_model: Takes an instance of a XgboostModel class. If not provided, BlueCast will instantiate one. 
# bluecast/blueprints/cast_cv.py
import copy
from typing import Any, List, Literal, Optional, Tuple, Union

import pandas as pd
from sklearn.model_selection import StratifiedKFold

from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import (
    TrainingConfig,
    XgboostFinalParamConfig,
    XgboostTuneParamsConfig,
)


class BlueCastCV:
    """Train one BlueCast pipeline per cross-validation fold and blend them.

    Every call to :meth:`fit` or :meth:`fit_eval` appends one fitted
    ``BlueCast`` instance per fold to ``bluecast_models``; :meth:`predict`
    runs all stored instances and either returns their individual outputs or
    the row-wise average.

    :param class_problem: Problem type forwarded to each ``BlueCast``
        instance. Either "binary" or "multiclass".
    :param conf_training: Training configuration. When omitted, a default
        ``TrainingConfig`` is created on the first fit call.
    :param conf_xgboost: Forwarded unchanged to each ``BlueCast`` instance.
    :param conf_params_xgboost: Forwarded unchanged to each ``BlueCast``
        instance.
    """

    def __init__(
        self,
        class_problem: Literal["binary", "multiclass"] = "binary",
        conf_training: Optional[TrainingConfig] = None,
        conf_xgboost: Optional[XgboostTuneParamsConfig] = None,
        conf_params_xgboost: Optional[XgboostFinalParamConfig] = None,
    ):
        self.class_problem = class_problem
        self.conf_training = conf_training
        self.conf_xgboost = conf_xgboost
        self.conf_params_xgboost = conf_params_xgboost
        self.bluecast_models: List[BlueCast] = []

    def prepare_data(
        self, df: pd.DataFrame, target: str
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """Split ``df`` into a feature frame and the target column.

        :param df: Frame that contains the column ``target``.
        :param target: Name of the target column.
        :return: ``(X, y)`` where ``X`` is ``df`` without ``target`` and ``y``
            is ``df[target]``.
        """
        y = df[target]
        X = df.drop(target, axis=1)
        return X, y

    def _ensure_training_config(self) -> TrainingConfig:
        """Return the training config, creating the default one if unset."""
        if self.conf_training is None:
            self.conf_training = TrainingConfig()
        return self.conf_training

    @staticmethod
    def _resolve_splitter(stratifier: Optional[Any], random_state: int) -> Any:
        """Return ``stratifier`` or a shuffled 5-fold ``StratifiedKFold``."""
        if stratifier is not None:
            return stratifier
        return StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    def _build_fold_model(
        self, target_column: str, base_conf: TrainingConfig, fold_number: int
    ) -> BlueCast:
        """Create the ``BlueCast`` instance for one fold.

        The fold gets its own copy of the training config whose seed is
        ``base_conf.global_random_state + fold_number``. Copying keeps the
        caller's config untouched and stops folds from sharing (and
        overwriting) one seed value.
        """
        fold_conf = copy.deepcopy(base_conf)
        fold_conf.global_random_state += fold_number
        return BlueCast(
            class_problem=self.class_problem,
            target_column=target_column,
            conf_training=fold_conf,
            conf_xgboost=self.conf_xgboost,
            conf_params_xgboost=self.conf_params_xgboost,
        )

    def fit(
        self, df: pd.DataFrame, target: str, stratifier: Optional[Any] = None
    ) -> None:
        """Fit one ``BlueCast`` instance per fold on the full data.

        For every split the fold's training rows and validation rows are
        stacked (training rows first) into one frame with a fresh index, the
        target column is attached, and that frame is passed to
        ``BlueCast.fit``. The folds therefore differ in row order and seed.

        :param df: Training data including the target column.
        :param target: Name of the target column.
        :param stratifier: Object exposing ``split(X, y)`` that yields
            ``(train_idx, val_idx)`` positional index pairs. Defaults to
            ``StratifiedKFold(n_splits=5, shuffle=True)`` seeded with the
            training config's ``global_random_state``.
        """
        X, y = self.prepare_data(df, target)
        base_conf = self._ensure_training_config()
        splitter = self._resolve_splitter(stratifier, base_conf.global_random_state)

        for fn, (trn_idx, val_idx) in enumerate(splitter.split(X, y)):
            # Stack rows (not columns): a column-wise concat of two disjoint
            # row sets would yield a NaN-padded frame with duplicated names.
            x_train = pd.concat([X.iloc[trn_idx], X.iloc[val_idx]], ignore_index=True)
            y_train = pd.concat([y.iloc[trn_idx], y.iloc[val_idx]], ignore_index=True)
            # Both pieces now share the same 0..n-1 index, so this aligns 1:1.
            x_train[target] = y_train

            automl = self._build_fold_model(target, base_conf, fn)
            automl.fit(x_train, target_col=target)
            self.bluecast_models.append(automl)

    def fit_eval(
        self, df: pd.DataFrame, target_col: str, stratifier: Optional[Any] = None
    ) -> None:
        """Fit one ``BlueCast`` instance per fold and evaluate it out of fold.

        For every split the training rows (with the target column attached)
        and the held-out features and target are passed to
        ``BlueCast.fit_eval``.

        :param df: Training data including the target column.
        :param target_col: Name of the target column.
        :param stratifier: Object exposing ``split(X, y)`` that yields
            ``(train_idx, val_idx)`` positional index pairs. Defaults to
            ``StratifiedKFold(n_splits=5, shuffle=True)`` seeded with the
            training config's ``global_random_state``.
        """
        X, y = self.prepare_data(df, target_col)
        base_conf = self._ensure_training_config()
        splitter = self._resolve_splitter(stratifier, base_conf.global_random_state)

        for fn, (trn_idx, val_idx) in enumerate(splitter.split(X, y)):
            # Copy before adding the target so we never write into a slice
            # of X (avoids pandas' chained-assignment pitfalls).
            X_train = X.iloc[trn_idx].copy()
            X_train[target_col] = y.iloc[trn_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            automl = self._build_fold_model(target_col, base_conf, fn)
            automl.fit_eval(X_train, X_val, y_val, target_col=target_col)
            self.bluecast_models.append(automl)

    def predict(
        self, df: pd.DataFrame, return_sub_models_preds: bool = False
    ) -> Tuple[Union[pd.DataFrame, pd.Series], Union[pd.DataFrame, pd.Series]]:
        """Predict with every stored fold model.

        ``df`` itself is not modified; the per-model outputs are collected in
        separate frames that share ``df``'s index.

        :param df: Feature frame handed to each model's ``predict``.
        :param return_sub_models_preds: When True, return the per-model
            outputs instead of the blended result.
        :return: If ``return_sub_models_preds`` is True, two DataFrames with
            columns ``proba_<i>`` and ``classes_<i>`` (one per model).
            Otherwise the row-wise mean of the ``proba_<i>`` columns and a
            boolean Series marking where that mean exceeds 0.5.
            NOTE(review): the single-column storage and the 0.5 cut-off
            presume one probability per row (binary case) - confirm what
            ``BlueCast.predict`` returns for "multiclass".
        :raises ValueError: If no model has been fitted yet.
        """
        if not self.bluecast_models:
            raise ValueError(
                "No fitted models available. Call fit or fit_eval before predict."
            )

        or_cols = df.columns
        sub_probs = pd.DataFrame(index=df.index)
        sub_classes = pd.DataFrame(index=df.index)
        for fn, pipeline in enumerate(self.bluecast_models):
            # A fresh column selection per model: every model gets its own
            # frame object holding exactly the caller's columns.
            y_probs, y_classes = pipeline.predict(df.loc[:, or_cols])
            sub_probs[f"proba_{fn}"] = y_probs
            sub_classes[f"classes_{fn}"] = y_classes

        if return_sub_models_preds:
            return sub_probs, sub_classes

        mean_probs = sub_probs.mean(axis=1)
        return mean_probs, mean_probs > 0.5
# NOTE: the patch this module comes from also changes
# bluecast/config/training_config.py (hunk at line 29):
#   TrainingConfig.use_full_data_for_final_model: bool = True  ->  False

# bluecast/tests/test_cast_cv.py
from typing import Tuple

import pandas as pd
import pytest

from bluecast.blueprints.cast_cv import BlueCastCV
from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig
from bluecast.tests.make_data.create_data import create_synthetic_dataframe


@pytest.fixture
def synthetic_train_test_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Return a (train, validation) pair of synthetic frames.

    Both come from ``create_synthetic_dataframe(2000, ...)``; they differ only
    in the ``random_state`` passed (20 for train, 200 for validation).
    """
    train_frame = create_synthetic_dataframe(2000, random_state=20)
    validation_frame = create_synthetic_dataframe(2000, random_state=200)
    return train_frame, validation_frame


def test_blueprint_cv_xgboost(synthetic_train_test_data):
    """Test that tests the BlueCast cv class"""
    train_frame, validation_frame = synthetic_train_test_data

    # Small search budget so the test stays quick.
    tune_conf = XgboostTuneParamsConfig()
    tune_conf.steps_max = 100
    tune_conf.num_leaves_max = 16

    training_conf = TrainingConfig()
    training_conf.hyperparameter_tuning_rounds = 10

    cv_model = BlueCastCV(conf_xgboost=tune_conf, conf_training=training_conf)
    cv_model.fit_eval(train_frame, target_col="target")
    print("Autotuning successful.")

    # Predict on features only; the target column is dropped first.
    features_only = validation_frame.drop("target", axis=1)
    y_probs, y_classes = cv_model.predict(features_only)
    print("Predicting successful.")

    # One prediction per validation row is all this test checks.
    expected_rows = len(validation_frame.index)
    assert len(y_probs) == expected_rows
    assert len(y_classes) == expected_rows