Merge pull request #8 from ThomasMeissnerDS/change_feature_selection_api
Change feature selection api
ThomasMeissnerDS authored Jun 25, 2023
2 parents 9fb604c + 05d37cc commit f47d7c0
Showing 13 changed files with 213 additions and 126 deletions.
49 changes: 34 additions & 15 deletions README.md
@@ -21,7 +21,8 @@ only) and a few preprocessing options (only what is
needed for Xgboost). This allows for a much faster development
cycle and a much more stable codebase while also having as few dependencies
as possible for the library. Despite being lightweight in its core, BlueCast
offers high customization options for advanced users.
offers high customization options for advanced users. Find
the full documentation [here](https://bluecast.readthedocs.io/en/latest/).

<!-- toc -->

@@ -267,41 +268,59 @@ automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
```
Also this step can be customized. An instance of `RFECV` is expected for `selection_strategy`.
Otherwise the pipeline will fail. To work around the `RFECV` limitation, a custom feature
selection algorithm can also be passed as part of a custom last mile computation.
Here is an example adjusting the built-in solution via `RFECV`:
Also this step can be customized. The following example shows how to provide a custom feature selector:
```python
import pandas as pd
import xgboost as xgb
from bluecast.config.training_config import FeatureSelectionConfig
from bluecast.config.training_config import TrainingConfig
from bluecast.preprocessing.custom import CustomPreprocessing
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from typing import Optional, Tuple


# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.enable_feature_selection = True

# add custom feature selection
custom_feat_sel = FeatureSelectionConfig()
# custom_feat_sel.execute_selection = False
custom_feat_sel.selection_strategy = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=StratifiedKFold(10, random_state=0, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=1,
)
class RFECVSelector(CustomPreprocessing):
def __init__(self, random_state: int = 0):
super().__init__()
self.selected_features = None
self.random_state = random_state
self.selection_strategy: RFECV = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=StratifiedKFold(5, random_state=random_state, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=2,
)

def fit_transform(self, df: pd.DataFrame, target: pd.Series) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
self.selection_strategy.fit(df, target)
self.selected_features = self.selection_strategy.support_
df = df.loc[:, self.selected_features]
return df, target

def transform(self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df = df.loc[:, self.selected_features]
return df, target

custom_feature_selector = RFECVSelector()

# Create an instance of the BlueCast class with the custom model
bluecast = BlueCast(
class_problem="binary",
target_column="target",
conf_feature_selection=custom_feat_sel,
conf_training=train_config,
    custom_feature_selector=custom_feature_selector,
)

# Create some sample data for testing
x_train = pd.DataFrame(
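A minimal end-to-end sketch completing the example above, assuming the `bluecast` instance and `RFECVSelector` defined in the snippet; the synthetic data, column names, and sample sizes are illustrative only, while the `fit`/`predict` calls follow the API shown earlier in this README:

```python
import numpy as np
import pandas as pd

# Hypothetical toy data; any tabular dataset with a binary target works here.
rng = np.random.default_rng(0)
df_train = pd.DataFrame(
    rng.normal(size=(200, 10)), columns=[f"feat_{i}" for i in range(10)]
)
df_train["target"] = rng.integers(0, 2, size=200)
df_val = df_train.drop(columns=["target"]).sample(50, random_state=0)

# fit() runs RFECVSelector.fit_transform on the train split; predict() reuses
# RFECVSelector.transform so only the selected features reach the model.
bluecast.fit(df_train, target_col="target")
y_probs, y_classes = bluecast.predict(df_val)
```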
107 changes: 70 additions & 37 deletions bluecast/blueprints/cast.py
@@ -7,25 +7,25 @@
via the config class attributes from config.training_config module.
"""
import warnings
from datetime import datetime
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import numpy as np
import pandas as pd

from bluecast.config.training_config import (
FeatureSelectionConfig,
TrainingConfig,
XgboostFinalParamConfig,
XgboostTuneParamsConfig,
)
from bluecast.evaluation.eval_metrics import eval_classifier
from bluecast.evaluation.shap_values import shap_explanations
from bluecast.general_utils.general_utils import check_gpu_support
from bluecast.general_utils.general_utils import check_gpu_support, logger
from bluecast.ml_modelling.xgboost import XgboostModel
from bluecast.preprocessing.custom import CustomPreprocessing
from bluecast.preprocessing.datetime_features import date_converter
from bluecast.preprocessing.encode_target_labels import TargetLabelEncoder
from bluecast.preprocessing.feature_selection import FeatureSelector
from bluecast.preprocessing.feature_selection import RFECVSelector
from bluecast.preprocessing.feature_types import FeatureTypeDetector
from bluecast.preprocessing.nulls_and_infs import fill_infinite_values
from bluecast.preprocessing.schema_checks import SchemaDetector
@@ -67,10 +67,12 @@ def __init__(
ml_model: Optional[Union[XgboostModel, Any]] = None,
custom_last_mile_computation: Optional[CustomPreprocessing] = None,
custom_preprocessor: Optional[CustomPreprocessing] = None,
custom_feature_selector: Optional[
Union[RFECVSelector, CustomPreprocessing]
] = None,
conf_training: Optional[TrainingConfig] = None,
conf_xgboost: Optional[XgboostTuneParamsConfig] = None,
conf_params_xgboost: Optional[XgboostFinalParamConfig] = None,
conf_feature_selection: Optional[FeatureSelectionConfig] = None,
):
self.class_problem = class_problem
self.prediction_mode: bool = False
@@ -81,8 +83,6 @@ def __init__(
self.conf_training = conf_training
self.conf_xgboost = conf_xgboost
self.conf_params_xgboost = conf_params_xgboost
self.conf_feature_selection = conf_feature_selection
self.feature_selector: Optional[FeatureSelector] = None
self.feat_type_detector: Optional[FeatureTypeDetector] = None
self.cat_encoder: Optional[
Union[BinaryClassTargetEncoder, MultiClassTargetEncoder]
@@ -92,8 +92,54 @@ def __init__(
self.ml_model: Optional[XgboostModel] = ml_model
self.custom_last_mile_computation = custom_last_mile_computation
self.custom_preprocessor = custom_preprocessor
self.custom_feature_selector = custom_feature_selector
self.shap_values: Optional[np.ndarray] = None

def initial_checks(self, df: pd.DataFrame) -> None:
if not self.conf_training:
self.conf_training = TrainingConfig()
if not self.conf_training.enable_feature_selection:
message = """Feature selection is disabled. Update the TrainingConfig param 'enable_feature_selection'
to enable it or make use of a custom preprocessor to do it manually during the last mile computations step.
Feature selection is recommended for datasets with many features (>1000). For datasets with a small amount
of features feature selection is not recommended.
"""
warnings.warn(message, UserWarning, stacklevel=2)

if self.conf_training.hypertuning_cv_folds == 1:
message = """Cross validation is disabled. Update the TrainingConfig param 'hypertuning_cv_folds'
to enable it. Cross validation is disabled on default to allow fast prototyping. For robust hyperparameter
tuning using at least 5 folds is recommended."""
warnings.warn(message, UserWarning, stacklevel=2)

if (
self.conf_training.enable_feature_selection
and not self.custom_feature_selector
):
message = """Feature selection is enabled but no feature selector has been provided. Falling back to
cross-validated feature elimination. Specifically for small datasets check the logs to verify that not too
many features have been removed. Otherwise, consider disabling feature selection or providing a custom
feature selector."""
warnings.warn(message, UserWarning, stacklevel=2)
if not self.conf_xgboost:
message = """No XgboostTuneParamsConfig has been provided. Falling back to default values. Default values
have been chosen to speed up the prototyping. For robust hyperparameter tuning consider providing a custom
XgboostTuneParamsConfig with a deeper hyperparameter search space and a custom TrainingConfig to enable
cross-validation."""
warnings.warn(message, UserWarning, stacklevel=2)
if (
self.conf_training.min_features_to_select >= len(df.columns)
and self.conf_training.enable_feature_selection
):
message = """The minimum number of features to select is greater or equal to the number of features in
the dataset while feature selection is enabled. Consider reducing the minimum number of features to
select or disabling feature selection via TrainingConfig."""
warnings.warn(message, UserWarning, stacklevel=2)
if self.target_column in df.columns:
message = """The target column is present in the dataset. Consider removing the target column from the
dataset to prevent leakage."""
warnings.warn(message, UserWarning, stacklevel=2)

def fit(self, df: pd.DataFrame, target_col: str) -> None:
"""Train a full ML pipeline."""
check_gpu_support()
@@ -116,19 +162,7 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
if not self.conf_training:
self.conf_training = TrainingConfig()

if not self.conf_training.enable_feature_selection:
message = """Feature selection is disabled. Update the TrainingConfig param 'enable_feature_selection'
to enable it or make use of a custom preprocessor to do it manually during the last mile computations step.
Feature selection is recommended for datasets with many features (>1000). For datasets with a small amount
of features feature selection is not recommended.
"""
warnings.warn(message, UserWarning, stacklevel=2)

if self.conf_training.hypertuning_cv_folds == 1:
message = """Cross validation is disabled. Update the TrainingConfig param 'hypertuning_cv_folds'
to enable it. Cross validation is disabled on default to allow fast prototyping. For robust hyperparameter
tuning using at least 5 folds is recommended."""
warnings.warn(message, UserWarning, stacklevel=2)
self.initial_checks(df)

x_train, x_test, y_train, y_test = train_test_split(
df,
@@ -181,17 +215,19 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
x_test, y_test, predicton_mode=False
)

if not self.conf_feature_selection:
self.conf_feature_selection = FeatureSelectionConfig()
if not self.custom_feature_selector:
self.custom_feature_selector = RFECVSelector(
random_state=self.conf_training.global_random_state,
min_features_to_select=self.conf_training.min_features_to_select,
)

if self.conf_training.enable_feature_selection:
self.feature_selector = FeatureSelector(
selection_strategy=self.conf_feature_selection.selection_strategy
x_train, y_train = self.custom_feature_selector.fit_transform(
x_train, y_train
)
x_test, _ = self.custom_feature_selector.transform(
x_test, predicton_mode=False
)

if self.feature_selector and self.conf_training.enable_feature_selection:
x_train = self.feature_selector.fit_transform(x_train, y_train)
x_test = self.feature_selector.transform(x_test)

if not self.ml_model:
self.ml_model = XgboostModel(
@@ -223,7 +259,7 @@ def fit_eval(
"""
self.fit(df, target_col)
y_probs, y_classes = self.predict(df_eval)
eval_dict = eval_classifier(target_eval.values, y_classes)
eval_dict = eval_classifier(target_eval.values, y_probs, y_classes)
return eval_dict

def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -232,6 +268,9 @@ def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
if not self.feat_type_detector:
raise Exception("Feature type converter could not be found.")

if not self.conf_training:
raise Exception("Training configuration could not be found.")

df = self.feat_type_detector.transform_feature_types(
df, ignore_cols=[self.target_column]
)
@@ -264,14 +303,8 @@ def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
if self.custom_last_mile_computation:
df, _ = self.custom_last_mile_computation.transform(df, predicton_mode=True)

if not self.conf_feature_selection:
self.conf_feature_selection = FeatureSelectionConfig()

if not self.conf_training:
self.conf_training = TrainingConfig()

if self.feature_selector and self.conf_training.enable_feature_selection:
df = self.feature_selector.transform(df)
if self.custom_feature_selector and self.conf_training.enable_feature_selection:
df, _ = self.custom_feature_selector.transform(df, predicton_mode=True)

return df

@@ -290,7 +323,7 @@ def predict(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
check_gpu_support()
df = self.transform_new_data(df)

print("Predicting...")
logger(f"{datetime.utcnow()}: Predicting...")
y_probs, y_classes = self.ml_model.predict(df)

if self.feat_type_detector.cat_columns:
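To show what the widened `custom_feature_selector` argument accepts, here is a hedged sketch of a hand-rolled selector. Only the `CustomPreprocessing` interface (`fit_transform`/`transform`), the `TrainingConfig` flag, and the `BlueCast` keyword arguments are taken from the diff and the README example above; the variance-threshold logic, the class name, and the import path for `BlueCast` are illustrative assumptions.

```python
from typing import Optional, Tuple

import pandas as pd

from bluecast.blueprints.cast import BlueCast  # assumed import path, mirroring the file layout
from bluecast.config.training_config import TrainingConfig
from bluecast.preprocessing.custom import CustomPreprocessing


class VarianceSelector(CustomPreprocessing):
    """Illustrative selector: drop near-constant numerical columns."""

    def __init__(self, threshold: float = 1e-3):
        super().__init__()
        self.threshold = threshold
        self.keep_cols: Optional[pd.Index] = None

    def fit_transform(
        self, df: pd.DataFrame, target: pd.Series
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        variances = df.var(numeric_only=True)
        self.keep_cols = variances[variances > self.threshold].index
        return df.loc[:, self.keep_cols], target

    def transform(
        self,
        df: pd.DataFrame,
        target: Optional[pd.Series] = None,
        predicton_mode: bool = False,  # spelling follows the library's signature
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        return df.loc[:, self.keep_cols], target


train_config = TrainingConfig()
train_config.enable_feature_selection = True

automl = BlueCast(
    class_problem="binary",
    target_column="target",
    conf_training=train_config,
    custom_feature_selector=VarianceSelector(),
)
```

As the `fit` diff shows, BlueCast calls `fit_transform` on the train split and `transform` on the test split and at prediction time, so any object implementing those two methods can slot in; when none is provided, the built-in `RFECVSelector` is used as a fallback.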
19 changes: 1 addition & 18 deletions bluecast/config/training_config.py
@@ -7,11 +7,7 @@
"""
from typing import Dict, Optional

import xgboost as xgb
from pydantic.dataclasses import dataclass
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold


class Config:
@@ -34,20 +30,7 @@ class TrainingConfig:
train_size: float = 0.8
train_split_stratify: bool = True
use_full_data_for_final_model: bool = True


@dataclass(config=Config)
class FeatureSelectionConfig:
"""Define feature selection parameters."""

selection_strategy: RFECV = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=StratifiedKFold(5, random_state=0, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=4,
)
min_features_to_select: int = 5


@dataclass
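A short sketch of how the relocated setting reads after this change; the attribute names come from the diff, the values are illustrative:

```python
from bluecast.config.training_config import TrainingConfig

train_config = TrainingConfig()
train_config.enable_feature_selection = True  # turn the built-in selection step on
train_config.min_features_to_select = 10      # replaces the old FeatureSelectionConfig knob
train_config.hypertuning_cv_folds = 5         # also avoids the cross-validation warning in initial_checks
```

The fallback `RFECVSelector` is instantiated in `fit` with `min_features_to_select=self.conf_training.min_features_to_select`, so the threshold now flows through `TrainingConfig` instead of a separate `FeatureSelectionConfig` object.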
18 changes: 7 additions & 11 deletions bluecast/evaluation/eval_metrics.py
@@ -28,34 +28,30 @@ def balanced_log_loss(y_true, y_pred):
return (l0.mean() + l1.mean()) / 2


def eval_classifier(y_true: np.ndarray, y_classes: np.ndarray) -> Dict[str, Any]:
def eval_classifier(
y_true: np.ndarray, y_probs: np.ndarray, y_classes: np.ndarray
) -> Dict[str, Any]:
try:
matthews = matthews_corrcoef(y_true, y_classes)
except Exception:
matthews = 0

print(f"The Matthew correlation is {matthews}")
logger(f"The Matthew correlation is {matthews}")
print("-------------------")
accuracy = accuracy_score(y_true, y_classes)
print(f"The accuracy is {accuracy}")
logger(f"The accuracy is {accuracy}")
recall = recall_score(y_true, y_classes, average="weighted")
print(f"The recall is {recall}")
logger(f"The recall is {recall}")
f1_score_macro = f1_score(y_true, y_classes, average="macro", zero_division=0)
print(f"The macro F1 score is {f1_score_macro}")
logger(f"The macro F1 score is {f1_score_macro}")
f1_score_micro = f1_score(y_true, y_classes, average="micro", zero_division=0)
print(f"The micro F1 score is {f1_score_micro}")
logger(f"The micro F1 score is {f1_score_micro}")
f1_score_weighted = f1_score(y_true, y_classes, average="weighted", zero_division=0)
print(f"The weighted F1 score is {f1_score_weighted}")
logger(f"The weighted F1 score is {f1_score_weighted}")
bll = balanced_log_loss(y_true, y_classes)
print(f"The balanced logloss is {bll}")
bll = balanced_log_loss(y_true, y_probs)
logger(f"The balanced logloss is {bll}")

full_classification_report = classification_report(y_true, y_classes)
print(full_classification_report)
logger(full_classification_report)

evaluation_scores = {
"matthews": matthews,
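A hedged usage sketch of the new `eval_classifier` signature; the arrays are made up, and only the argument order (`y_true, y_probs, y_classes`) and the `matthews` key are taken from the diff:

```python
import numpy as np

from bluecast.evaluation.eval_metrics import eval_classifier

# Illustrative arrays; in the pipeline y_probs and y_classes come from BlueCast.predict().
y_true = np.array([0, 1, 1, 0, 1, 0])
y_probs = np.array([0.1, 0.8, 0.7, 0.4, 0.9, 0.2])  # probabilities now feed balanced_log_loss
y_classes = (y_probs > 0.5).astype(int)              # hard labels feed the remaining metrics

scores = eval_classifier(y_true, y_probs, y_classes)
print(scores["matthews"])
```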