-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f356a6b
commit 594aa74
Showing
4 changed files
with
161 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
from typing import Any, List, Literal, Optional, Tuple, Union | ||
|
||
import pandas as pd | ||
from sklearn.model_selection import StratifiedKFold | ||
|
||
from bluecast.blueprints.cast import BlueCast | ||
from bluecast.config.training_config import ( | ||
TrainingConfig, | ||
XgboostFinalParamConfig, | ||
XgboostTuneParamsConfig, | ||
) | ||
|
||
|
||
class BlueCastCV:
    """Train one BlueCast pipeline per cross-validation fold and blend them.

    Fitted pipelines are collected in ``bluecast_models``. ``predict`` runs
    every collected pipeline on the same input and either returns the
    per-model outputs or the row-wise mean of the predicted probabilities.
    """

    def __init__(
        self,
        class_problem: Literal["binary", "multiclass"] = "binary",
        conf_training: Optional["TrainingConfig"] = None,
        conf_xgboost: Optional["XgboostTuneParamsConfig"] = None,
        conf_params_xgboost: Optional["XgboostFinalParamConfig"] = None,
    ):
        """Store the configuration that is handed to every fold pipeline.

        :param class_problem: Forwarded unchanged to each BlueCast instance.
        :param conf_training: Training configuration; when None a default
            ``TrainingConfig()`` is created on the first fit call.
        :param conf_xgboost: Forwarded unchanged to each BlueCast instance.
        :param conf_params_xgboost: Forwarded unchanged to each BlueCast
            instance.
        """
        self.class_problem = class_problem
        self.conf_training = conf_training
        self.conf_xgboost = conf_xgboost
        self.conf_params_xgboost = conf_params_xgboost
        # One fitted pipeline per fold, in fold order.
        self.bluecast_models: List["BlueCast"] = []

    def prepare_data(
        self, df: pd.DataFrame, target: str
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """Split ``df`` into a feature frame and the ``target`` column.

        :param df: Frame that contains the target column.
        :param target: Name of the target column.
        :return: ``(X, y)`` where ``X`` is ``df`` without the target column
            and ``y`` is the target column.
        """
        y = df[target]
        X = df.drop(target, axis=1)
        return X, y

    def _resolve_stratifier(self, stratifier: Optional[Any]) -> Any:
        """Return the splitter to use, creating default config/splitter if absent."""
        # The config must exist first because the default splitter reads its seed.
        if self.conf_training is None:
            self.conf_training = TrainingConfig()

        if stratifier is None:
            stratifier = StratifiedKFold(
                n_splits=5,
                shuffle=True,
                random_state=self.conf_training.global_random_state,
            )
        return stratifier

    def _new_fold_model(self, target_col: str) -> "BlueCast":
        """Build an unfitted BlueCast pipeline from the stored configuration."""
        return BlueCast(
            class_problem=self.class_problem,
            target_column=target_col,
            conf_training=self.conf_training,
            conf_xgboost=self.conf_xgboost,
            conf_params_xgboost=self.conf_params_xgboost,
        )

    def fit(
        self, df: pd.DataFrame, target: str, stratifier: Optional[Any] = None
    ) -> None:
        """Fit one pipeline per fold on all rows of ``df``.

        For every split the training rows and the validation rows are stacked
        row-wise (training rows first), so each pipeline sees the full data in
        a fold-specific order and with a fold-specific random seed.

        :param df: Training data including the target column.
        :param target: Name of the target column.
        :param stratifier: Object exposing ``split(X, y)``. Defaults to a
            shuffled 5-fold ``StratifiedKFold`` seeded from the training
            config.
        """
        X, y = self.prepare_data(df, target)
        stratifier = self._resolve_stratifier(stratifier)

        for fn, (trn_idx, val_idx) in enumerate(stratifier.split(X, y)):
            # Stack rows, not columns: both parts share the same columns, and
            # a fresh 0..n-1 index keeps features and target aligned.
            x_train = pd.concat(
                [X.iloc[trn_idx], X.iloc[val_idx]], axis=0, ignore_index=True
            )
            y_train = pd.concat(
                [y.iloc[trn_idx], y.iloc[val_idx]], axis=0, ignore_index=True
            )
            x_train[target] = y_train

            # The config object is shared, so these increments accumulate
            # across folds (and across repeated fit calls).
            self.conf_training.global_random_state += fn

            automl = self._new_fold_model(target)
            automl.fit(x_train, target_col=target)
            self.bluecast_models.append(automl)

    def fit_eval(
        self, df: pd.DataFrame, target_col: str, stratifier: Optional[Any] = None
    ) -> None:
        """Fit one pipeline per fold and evaluate it on that fold's held-out rows.

        :param df: Training data including the target column.
        :param target_col: Name of the target column.
        :param stratifier: Object exposing ``split(X, y)``. Defaults to a
            shuffled 5-fold ``StratifiedKFold`` seeded from the training
            config.
        """
        X, y = self.prepare_data(df, target_col)
        stratifier = self._resolve_stratifier(stratifier)

        for fn, (trn_idx, val_idx) in enumerate(stratifier.split(X, y)):
            # Copy before adding the target so the write does not go through
            # a slice of X (avoids chained-assignment warnings).
            X_train = X.iloc[trn_idx].copy()
            X_val = X.iloc[val_idx]
            y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]
            X_train[target_col] = y_train

            # The config object is shared, so these increments accumulate
            # across folds (and across repeated fit calls).
            self.conf_training.global_random_state += fn

            automl = self._new_fold_model(target_col)
            automl.fit_eval(X_train, X_val, y_val, target_col=target_col)
            self.bluecast_models.append(automl)

    def predict(
        self, df: pd.DataFrame, return_sub_models_preds: bool = False
    ) -> Tuple[Union[pd.DataFrame, pd.Series], Union[pd.DataFrame, pd.Series]]:
        """Predict with every fitted pipeline and optionally blend the results.

        The caller's ``df`` is left untouched; each pipeline receives its own
        copy.

        :param df: Feature frame to predict on.
        :param return_sub_models_preds: When True, return the per-model
            outputs instead of the blend.
        :return: If ``return_sub_models_preds`` is True, two DataFrames with
            columns ``proba_<i>`` and ``classes_<i>`` (one per pipeline).
            Otherwise the row-wise mean of the ``proba_<i>`` columns and a
            boolean Series that is True where that mean exceeds 0.5.
        """
        # NOTE(review): thresholding the mean at 0.5 presumes one probability
        # per row (binary case) - confirm the intended multiclass behavior.
        probs = pd.DataFrame(index=df.index)
        classes = pd.DataFrame(index=df.index)
        for fn, pipeline in enumerate(self.bluecast_models):
            y_probs, y_classes = pipeline.predict(df.copy())
            probs[f"proba_{fn}"] = y_probs
            classes[f"classes_{fn}"] = y_classes

        if return_sub_models_preds:
            return probs, classes

        mean_probs = probs.mean(axis=1)
        return mean_probs, mean_probs > 0.5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from typing import Tuple | ||
|
||
import pandas as pd | ||
import pytest | ||
|
||
from bluecast.blueprints.cast_cv import BlueCastCV | ||
from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig | ||
from bluecast.tests.make_data.create_data import create_synthetic_dataframe | ||
|
||
|
||
@pytest.fixture
def synthetic_train_test_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Provide a (train, validation) pair of synthetic frames.

    Both frames come from ``create_synthetic_dataframe`` called with 2000
    (presumably the row count) and differ only in their random state.
    """
    return (
        create_synthetic_dataframe(2000, random_state=20),
        create_synthetic_dataframe(2000, random_state=200),
    )
|
||
|
||
def test_blueprint_cv_xgboost(synthetic_train_test_data):
    """Fit BlueCastCV with fold evaluation and check the prediction lengths."""
    train_frame, holdout_frame = synthetic_train_test_data

    # Small search space and few tuning rounds keep the test quick.
    tune_params = XgboostTuneParamsConfig()
    tune_params.steps_max = 100
    tune_params.num_leaves_max = 16
    training_settings = TrainingConfig()
    training_settings.hyperparameter_tuning_rounds = 10

    cv_model = BlueCastCV(
        conf_xgboost=tune_params,
        conf_training=training_settings,
    )
    cv_model.fit_eval(train_frame, target_col="target")
    print("Autotuning successful.")

    holdout_features = holdout_frame.drop("target", axis=1)
    y_probs, y_classes = cv_model.predict(holdout_features)
    print("Predicting successful.")

    expected_rows = len(holdout_frame.index)
    assert len(y_probs) == expected_rows
    assert len(y_classes) == expected_rows