Add CV wrapper

ThomasMeissnerDS · Jul 4, 2023 · 594aa74 · 594aa74
1 parent f356a6b
commit 594aa74
Show file tree

Hide file tree

Showing 4 changed files with 161 additions and 3 deletions.
diff --git a/bluecast/blueprints/cast.py b/bluecast/blueprints/cast.py
@@ -44,9 +44,9 @@ class BlueCast:
     :param :class_problem: Takes a string containing the class problem type. Either "binary" or "multiclass".
     :param :target_column: Takes a string containing the name of the target column.
     :param :cat_columns: Takes a list of strings containing the names of the categorical columns. If not provided,
-    BlueCast will infer these automaically.
+    BlueCast will infer these automatically.
     :param :date_columns: Takes a list of strings containing the names of the date columns. If not provided,
-    BlueCast will infer these automaically.
+    BlueCast will infer these automatically.
     :param :time_split_column: Takes a string containing the name of the time split column. If not provided,
     BlueCast will not split the data by time or order, but do a random split instead.
     :param :ml_model: Takes an instance of a XgboostModel class. If not provided, BlueCast will instantiate one.

diff --git a/bluecast/blueprints/cast_cv.py b/bluecast/blueprints/cast_cv.py
@@ -0,0 +1,119 @@
+from typing import Any, List, Literal, Optional, Tuple, Union
+
+import pandas as pd
+from sklearn.model_selection import StratifiedKFold
+
+from bluecast.blueprints.cast import BlueCast
+from bluecast.config.training_config import (
+    TrainingConfig,
+    XgboostFinalParamConfig,
+    XgboostTuneParamsConfig,
+)
+
+
+class BlueCastCV:
+    """Train one BlueCast pipeline per cross-validation fold and combine them.
+
+    :param class_problem: Either "binary" or "multiclass". Forwarded unchanged
+        to every BlueCast instance created during fitting.
+    :param conf_training: Optional TrainingConfig. A default instance is
+        created on the first call to ``fit``/``fit_eval`` if not provided.
+    :param conf_xgboost: Optional XgboostTuneParamsConfig forwarded to every
+        BlueCast instance.
+    :param conf_params_xgboost: Optional XgboostFinalParamConfig forwarded to
+        every BlueCast instance.
+    """
+
+    def __init__(
+        self,
+        class_problem: Literal["binary", "multiclass"] = "binary",
+        conf_training: Optional[TrainingConfig] = None,
+        conf_xgboost: Optional[XgboostTuneParamsConfig] = None,
+        conf_params_xgboost: Optional[XgboostFinalParamConfig] = None,
+    ):
+        self.class_problem = class_problem
+        self.conf_training = conf_training
+        self.conf_xgboost = conf_xgboost
+        self.conf_params_xgboost = conf_params_xgboost
+        # One fitted BlueCast pipeline is appended per fold.
+        self.bluecast_models: List[BlueCast] = []
+
+    def prepare_data(
+        self, df: pd.DataFrame, target: str
+    ) -> Tuple[pd.DataFrame, pd.Series]:
+        """Split ``df`` into a feature frame and the ``target`` column."""
+        y = df[target]
+        X = df.drop(target, axis=1)
+        return X, y
+
+    def _resolve_stratifier(self, stratifier: Optional[Any]) -> Any:
+        """Ensure a training config exists and return the splitter to use.
+
+        Falls back to a shuffled 5-fold StratifiedKFold seeded with the
+        config's ``global_random_state`` when no splitter is passed.
+        """
+        if self.conf_training is None:
+            self.conf_training = TrainingConfig()
+
+        if stratifier is None:
+            stratifier = StratifiedKFold(
+                n_splits=5,
+                shuffle=True,
+                random_state=self.conf_training.global_random_state,
+            )
+        return stratifier
+
+    def _build_fold_model(self, fold: int, target_col: str) -> BlueCast:
+        """Offset the random seed for this fold and create a BlueCast instance."""
+        if self.conf_training is None:
+            self.conf_training = TrainingConfig()
+
+        # NOTE(review): the seed is incremented in place on the shared config,
+        # so offsets accumulate across folds and across repeated fit calls.
+        # Kept as-is to preserve the existing seed sequence.
+        self.conf_training.global_random_state += fold
+
+        return BlueCast(
+            class_problem=self.class_problem,
+            target_column=target_col,
+            conf_training=self.conf_training,
+            conf_xgboost=self.conf_xgboost,
+            conf_params_xgboost=self.conf_params_xgboost,
+        )
+
+    def fit(
+        self, df: pd.DataFrame, target: str, stratifier: Optional[Any] = None
+    ) -> None:
+        """Fit one BlueCast model per fold on all rows of ``df``.
+
+        For every split the train and validation parts are stacked row-wise
+        again, so each model sees the complete data in a fold-specific row
+        order and with a fold-specific random seed.
+
+        :param df: DataFrame containing the features and the target column.
+        :param target: Name of the target column in ``df``.
+        :param stratifier: Optional splitter exposing ``split(X, y)``. Defaults
+            to a shuffled 5-fold StratifiedKFold.
+        """
+        X, y = self.prepare_data(df, target)
+        stratifier = self._resolve_stratifier(stratifier)
+
+        for fn, (trn_idx, val_idx) in enumerate(stratifier.split(X, y)):
+            # Stack rows (not columns): concatenating along axis=1 would align
+            # on the index and yield a NaN-filled frame with duplicated columns.
+            x_full = pd.concat([X.iloc[trn_idx], X.iloc[val_idx]], ignore_index=True)
+            y_full = pd.concat([y.iloc[trn_idx], y.iloc[val_idx]], ignore_index=True)
+            # Both parts share the same fresh RangeIndex, so this aligns 1:1.
+            x_full[target] = y_full
+
+            automl = self._build_fold_model(fn, target)
+            automl.fit(x_full, target_col=target)
+            self.bluecast_models.append(automl)
+
+    def fit_eval(
+        self, df: pd.DataFrame, target_col: str, stratifier: Optional[Any] = None
+    ) -> None:
+        """Fit one BlueCast model per fold and evaluate it on the held-out part.
+
+        :param df: DataFrame containing the features and the target column.
+        :param target_col: Name of the target column in ``df``.
+        :param stratifier: Optional splitter exposing ``split(X, y)``. Defaults
+            to a shuffled 5-fold StratifiedKFold.
+        """
+        X, y = self.prepare_data(df, target_col)
+        stratifier = self._resolve_stratifier(stratifier)
+
+        for fn, (trn_idx, val_idx) in enumerate(stratifier.split(X, y)):
+            # Copy before adding the target so we never write into a slice of X.
+            X_train = X.iloc[trn_idx].copy()
+            X_val = X.iloc[val_idx]
+            y_val = y.iloc[val_idx]
+            X_train[target_col] = y.iloc[trn_idx]
+
+            automl = self._build_fold_model(fn, target_col)
+            automl.fit_eval(X_train, X_val, y_val, target_col=target_col)
+            self.bluecast_models.append(automl)
+
+    def predict(
+        self, df: pd.DataFrame, return_sub_models_preds: bool = False
+    ) -> Tuple[Union[pd.DataFrame, pd.Series], Union[pd.DataFrame, pd.Series]]:
+        """Predict with every fold model and optionally average the results.
+
+        The passed DataFrame is not modified.
+
+        :param df: DataFrame with the features to predict on.
+        :param return_sub_models_preds: If True, return the per-model
+            predictions as two DataFrames (columns ``proba_<i>`` and
+            ``classes_<i>``). Otherwise return the row-wise mean probability
+            and a boolean Series that is True where that mean exceeds 0.5.
+        :raises ValueError: If no model has been fitted yet.
+        """
+        if not self.bluecast_models:
+            raise ValueError(
+                "No fitted models available. Call fit or fit_eval before predict."
+            )
+
+        # Collect predictions in a separate frame instead of adding columns to
+        # the caller's DataFrame.
+        preds = pd.DataFrame(index=df.index)
+        prob_cols: List[str] = []
+        class_cols: List[str] = []
+        for fn, pipeline in enumerate(self.bluecast_models):
+            # Each pipeline receives its own copy of the input.
+            y_probs, y_classes = pipeline.predict(df.copy())
+            preds[f"proba_{fn}"] = y_probs
+            preds[f"classes_{fn}"] = y_classes
+            prob_cols.append(f"proba_{fn}")
+            class_cols.append(f"classes_{fn}")
+
+        if return_sub_models_preds:
+            return preds.loc[:, prob_cols], preds.loc[:, class_cols]
+
+        mean_probs = preds.loc[:, prob_cols].mean(axis=1)
+        return mean_probs, mean_probs > 0.5
diff --git a/bluecast/config/training_config.py b/bluecast/config/training_config.py
@@ -29,7 +29,7 @@ class TrainingConfig:
     calculate_shap_values: bool = True
     train_size: float = 0.8
     train_split_stratify: bool = True
-    use_full_data_for_final_model: bool = True
+    use_full_data_for_final_model: bool = False
     min_features_to_select: int = 5
     cat_encoding_via_ml_algorithm: bool = False
 

diff --git a/bluecast/tests/test_cast_cv.py b/bluecast/tests/test_cast_cv.py
@@ -0,0 +1,39 @@
+from typing import Tuple
+
+import pandas as pd
+import pytest
+
+from bluecast.blueprints.cast_cv import BlueCastCV
+from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig
+from bluecast.tests.make_data.create_data import create_synthetic_dataframe
+
+
+@pytest.fixture
+def synthetic_train_test_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Return two synthetic 2000-row frames built with different random states."""
+    return (
+        create_synthetic_dataframe(2000, random_state=20),
+        create_synthetic_dataframe(2000, random_state=200),
+    )
+
+
+def test_blueprint_cv_xgboost(synthetic_train_test_data):
+    """Fit BlueCastCV via fit_eval and check the prediction lengths."""
+    df_train, df_val = synthetic_train_test_data[0], synthetic_train_test_data[1]
+
+    # Keep the tuning search small via reduced limits and few tuning rounds.
+    tune_config = XgboostTuneParamsConfig()
+    tune_config.steps_max = 100
+    tune_config.num_leaves_max = 16
+    training_config = TrainingConfig()
+    training_config.hyperparameter_tuning_rounds = 10
+
+    cv_model = BlueCastCV(conf_xgboost=tune_config, conf_training=training_config)
+    cv_model.fit_eval(df_train, target_col="target")
+    print("Autotuning successful.")
+
+    features_only = df_val.drop("target", axis=1)
+    y_probs, y_classes = cv_model.predict(features_only)
+    print("Predicting successful.")
+
+    expected_rows = len(df_val.index)
+    assert len(y_probs) == expected_rows
+    assert len(y_classes) == expected_rows