Merge pull request #8 from ThomasMeissnerDS/change_feature_selection_api
Change feature selection api
ThomasMeissnerDS authored Jun 25, 2023
2 parents 9fb604c + 05d37cc commit f47d7c0
Showing 13 changed files with 213 additions and 126 deletions.
49 changes: 34 additions & 15 deletions README.md
@@ -21,7 +21,8 @@ only) and a few preprocessing options (only what is
needed for Xgboost). This allows for a much faster development
cycle and a much more stable codebase while also having as few dependencies
as possible for the library. Despite being lightweight in its core, BlueCast
offers high customization options for advanced users.
offers high customization options for advanced users. Find
the full documentation [here](https://bluecast.readthedocs.io/en/latest/).

<!-- toc -->

@@ -267,41 +268,59 @@ automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
```
Also this step can be customized. An instance of `RFECV` is expected for `selection_strategy`.
Otherwise the pipeline will fail. To work around the `RFECV` limitation, a custom feature
selection algorithm can also be passed as part of a custom last mile computation.
Here is an example adjusting the built-in solution via `RFECV`:
Also this step can be customized. The following example shows how to provide a custom feature selector:
```python
import pandas as pd
import xgboost as xgb
from bluecast.config.training_config import FeatureSelectionConfig
from bluecast.config.training_config import TrainingConfig
from bluecast.preprocessing.custom import CustomPreprocessing
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from typing import Optional, Tuple


# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.enable_feature_selection = True

# add custom feature selection
custom_feat_sel = FeatureSelectionConfig()
# custom_feat_sel.execute_selection = False
custom_feat_sel.selection_strategy = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=StratifiedKFold(10, random_state=0, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=1,
)
class RFECVSelector(CustomPreprocessing):
def __init__(self, random_state: int = 0):
super().__init__()
self.selected_features = None
self.random_state = random_state
self.selection_strategy: RFECV = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=StratifiedKFold(5, random_state=random_state, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=2,
)

def fit_transform(self, df: pd.DataFrame, target: pd.Series) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
self.selection_strategy.fit(df, target)
self.selected_features = self.selection_strategy.support_
df = df.loc[:, self.selected_features]
return df, target

def transform(self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df = df.loc[:, self.selected_features]
return df, target

custom_feature_selector = RFECVSelector()

# Create an instance of the BlueCast class with the custom model
bluecast = BlueCast(
class_problem="binary",
target_column="target",
conf_feature_selection=custom_feat_sel,
conf_training=train_config,
    custom_feature_selector=custom_feature_selector,
)

# Create some sample data for testing
x_train = pd.DataFrame(
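A minimal end-to-end sketch completing the example above, assuming the `bluecast` instance and `RFECVSelector` defined in the snippet; the synthetic data, column names, and sample sizes are illustrative only, while the `fit`/`predict` calls follow the API shown earlier in this README:

```python
import numpy as np
import pandas as pd

# Hypothetical toy data; any tabular dataset with a binary target works here.
rng = np.random.default_rng(0)
df_train = pd.DataFrame(
    rng.normal(size=(200, 10)), columns=[f"feat_{i}" for i in range(10)]
)
df_train["target"] = rng.integers(0, 2, size=200)
df_val = df_train.drop(columns=["target"]).sample(50, random_state=0)

# fit() runs RFECVSelector.fit_transform on the train split; predict() reuses
# RFECVSelector.transform so only the selected features reach the model.
bluecast.fit(df_train, target_col="target")
y_probs, y_classes = bluecast.predict(df_val)
```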
107 changes: 70 additions & 37 deletions bluecast/blueprints/cast.py
@@ -7,25 +7,25 @@
via the config class attributes from config.training_config module.
"""
import warnings
from datetime import datetime
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import numpy as np
import pandas as pd

from bluecast.config.training_config import (
FeatureSelectionConfig,
TrainingConfig,
XgboostFinalParamConfig,
XgboostTuneParamsConfig,
)
from bluecast.evaluation.eval_metrics import eval_classifier
from bluecast.evaluation.shap_values import shap_explanations
from bluecast.general_utils.general_utils import check_gpu_support
from bluecast.general_utils.general_utils import check_gpu_support, logger
from bluecast.ml_modelling.xgboost import XgboostModel
from bluecast.preprocessing.custom import CustomPreprocessing
from bluecast.preprocessing.datetime_features import date_converter
from bluecast.preprocessing.encode_target_labels import TargetLabelEncoder
from bluecast.preprocessing.feature_selection import FeatureSelector
from bluecast.preprocessing.feature_selection import RFECVSelector
from bluecast.preprocessing.feature_types import FeatureTypeDetector
from bluecast.preprocessing.nulls_and_infs import fill_infinite_values
from bluecast.preprocessing.schema_checks import SchemaDetector
@@ -67,10 +67,12 @@ def __init__(
ml_model: Optional[Union[XgboostModel, Any]] = None,
custom_last_mile_computation: Optional[CustomPreprocessing] = None,
custom_preprocessor: Optional[CustomPreprocessing] = None,
custom_feature_selector: Optional[
Union[RFECVSelector, CustomPreprocessing]
] = None,
conf_training: Optional[TrainingConfig] = None,
conf_xgboost: Optional[XgboostTuneParamsConfig] = None,
conf_params_xgboost: Optional[XgboostFinalParamConfig] = None,
conf_feature_selection: Optional[FeatureSelectionConfig] = None,
):
self.class_problem = class_problem
self.prediction_mode: bool = False
@@ -81,8 +83,6 @@ def __init__(
self.conf_training = conf_training
self.conf_xgboost = conf_xgboost
self.conf_params_xgboost = conf_params_xgboost
self.conf_feature_selection = conf_feature_selection
self.feature_selector: Optional[FeatureSelector] = None
self.feat_type_detector: Optional[FeatureTypeDetector] = None
self.cat_encoder: Optional[
Union[BinaryClassTargetEncoder, MultiClassTargetEncoder]
@@ -92,8 +92,54 @@ def __init__(
self.ml_model: Optional[XgboostModel] = ml_model
self.custom_last_mile_computation = custom_last_mile_computation
self.custom_preprocessor = custom_preprocessor
self.custom_feature_selector = custom_feature_selector
self.shap_values: Optional[np.ndarray] = None

def initial_checks(self, df: pd.DataFrame) -> None:
if not self.conf_training:
self.conf_training = TrainingConfig()
if not self.conf_training.enable_feature_selection:
message = """Feature selection is disabled. Update the TrainingConfig param 'enable_feature_selection'
to enable it or make use of a custom preprocessor to do it manually during the last mile computations step.
Feature selection is recommended for datasets with many features (>1000). For datasets with a small amount
of features feature selection is not recommended.
"""
warnings.warn(message, UserWarning, stacklevel=2)

if self.conf_training.hypertuning_cv_folds == 1:
message = """Cross validation is disabled. Update the TrainingConfig param 'hypertuning_cv_folds'
to enable it. Cross validation is disabled on default to allow fast prototyping. For robust hyperparameter
tuning using at least 5 folds is recommended."""
warnings.warn(message, UserWarning, stacklevel=2)

if (
self.conf_training.enable_feature_selection
and not self.custom_feature_selector
):
message = """Feature selection is enabled but no feature selector has been provided. Falling back to
cross-validated feature elimination. Specifically for small datasets check the logs to verify that not too
many features have been removed. Otherwise, consider disabling feature selection or providing a custom
feature selector."""
warnings.warn(message, UserWarning, stacklevel=2)
if not self.conf_xgboost:
message = """No XgboostTuneParamsConfig has been provided. Falling back to default values. Default values
have been chosen to speed up the prototyping. For robust hyperparameter tuning consider providing a custom
XgboostTuneParamsConfig with a deeper hyperparameter search space and a custom TrainingConfig to enable
cross-validation."""
warnings.warn(message, UserWarning, stacklevel=2)
if (
self.conf_training.min_features_to_select >= len(df.columns)
and self.conf_training.enable_feature_selection
):
message = """The minimum number of features to select is greater or equal to the number of features in
the dataset while feature selection is enabled. Consider reducing the minimum number of features to
select or disabling feature selection via TrainingConfig."""
warnings.warn(message, UserWarning, stacklevel=2)
if self.target_column in df.columns:
message = """The target column is present in the dataset. Consider removing the target column from the
dataset to prevent leakage."""
warnings.warn(message, UserWarning, stacklevel=2)

def fit(self, df: pd.DataFrame, target_col: str) -> None:
"""Train a full ML pipeline."""
check_gpu_support()
@@ -116,19 +162,7 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
if not self.conf_training:
self.conf_training = TrainingConfig()

if not self.conf_training.enable_feature_selection:
message = """Feature selection is disabled. Update the TrainingConfig param 'enable_feature_selection'
to enable it or make use of a custom preprocessor to do it manually during the last mile computations step.
Feature selection is recommended for datasets with many features (>1000). For datasets with a small amount
of features feature selection is not recommended.
"""
warnings.warn(message, UserWarning, stacklevel=2)

if self.conf_training.hypertuning_cv_folds == 1:
message = """Cross validation is disabled. Update the TrainingConfig param 'hypertuning_cv_folds'
to enable it. Cross validation is disabled on default to allow fast prototyping. For robust hyperparameter
tuning using at least 5 folds is recommended."""
warnings.warn(message, UserWarning, stacklevel=2)
self.initial_checks(df)

x_train, x_test, y_train, y_test = train_test_split(
df,
@@ -181,17 +215,19 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
x_test, y_test, predicton_mode=False
)

if not self.conf_feature_selection:
self.conf_feature_selection = FeatureSelectionConfig()
if not self.custom_feature_selector:
self.custom_feature_selector = RFECVSelector(
random_state=self.conf_training.global_random_state,
min_features_to_select=self.conf_training.min_features_to_select,
)

if self.conf_training.enable_feature_selection:
self.feature_selector = FeatureSelector(
selection_strategy=self.conf_feature_selection.selection_strategy
x_train, y_train = self.custom_feature_selector.fit_transform(
x_train, y_train
)
x_test, _ = self.custom_feature_selector.transform(
x_test, predicton_mode=False
)

if self.feature_selector and self.conf_training.enable_feature_selection:
x_train = self.feature_selector.fit_transform(x_train, y_train)
x_test = self.feature_selector.transform(x_test)

if not self.ml_model:
self.ml_model = XgboostModel(
@@ -223,7 +259,7 @@ def fit_eval(
"""
self.fit(df, target_col)
y_probs, y_classes = self.predict(df_eval)
eval_dict = eval_classifier(target_eval.values, y_classes)
eval_dict = eval_classifier(target_eval.values, y_probs, y_classes)
return eval_dict

def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -232,6 +268,9 @@ def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
if not self.feat_type_detector:
raise Exception("Feature type converter could not be found.")

if not self.conf_training:
raise Exception("Training configuration could not be found.")

df = self.feat_type_detector.transform_feature_types(
df, ignore_cols=[self.target_column]
)
@@ -264,14 +303,8 @@ def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
if self.custom_last_mile_computation:
df, _ = self.custom_last_mile_computation.transform(df, predicton_mode=True)

if not self.conf_feature_selection:
self.conf_feature_selection = FeatureSelectionConfig()

if not self.conf_training:
self.conf_training = TrainingConfig()

if self.feature_selector and self.conf_training.enable_feature_selection:
df = self.feature_selector.transform(df)
if self.custom_feature_selector and self.conf_training.enable_feature_selection:
df, _ = self.custom_feature_selector.transform(df, predicton_mode=True)

return df

@@ -290,7 +323,7 @@ def predict(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
check_gpu_support()
df = self.transform_new_data(df)

print("Predicting...")
logger(f"{datetime.utcnow()}: Predicting...")
y_probs, y_classes = self.ml_model.predict(df)

if self.feat_type_detector.cat_columns:
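To show what the widened `custom_feature_selector` argument accepts, here is a hedged sketch of a hand-rolled selector. Only the `CustomPreprocessing` interface (`fit_transform`/`transform`), the `TrainingConfig` flag, and the `BlueCast` keyword arguments are taken from the diff and the README example above; the variance-threshold logic, the class name, and the import path for `BlueCast` are illustrative assumptions.

```python
from typing import Optional, Tuple

import pandas as pd

from bluecast.blueprints.cast import BlueCast  # assumed import path, mirroring the file layout
from bluecast.config.training_config import TrainingConfig
from bluecast.preprocessing.custom import CustomPreprocessing


class VarianceSelector(CustomPreprocessing):
    """Illustrative selector: drop near-constant numerical columns."""

    def __init__(self, threshold: float = 1e-3):
        super().__init__()
        self.threshold = threshold
        self.keep_cols: Optional[pd.Index] = None

    def fit_transform(
        self, df: pd.DataFrame, target: pd.Series
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        variances = df.var(numeric_only=True)
        self.keep_cols = variances[variances > self.threshold].index
        return df.loc[:, self.keep_cols], target

    def transform(
        self,
        df: pd.DataFrame,
        target: Optional[pd.Series] = None,
        predicton_mode: bool = False,  # spelling follows the library's signature
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        return df.loc[:, self.keep_cols], target


train_config = TrainingConfig()
train_config.enable_feature_selection = True

automl = BlueCast(
    class_problem="binary",
    target_column="target",
    conf_training=train_config,
    custom_feature_selector=VarianceSelector(),
)
```

As the `fit` diff shows, BlueCast calls `fit_transform` on the train split and `transform` on the test split and at prediction time, so any object implementing those two methods can slot in; when none is provided, the built-in `RFECVSelector` is used as a fallback.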
19 changes: 1 addition & 18 deletions bluecast/config/training_config.py
@@ -7,11 +7,7 @@
"""
from typing import Dict, Optional

import xgboost as xgb
from pydantic.dataclasses import dataclass
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold


class Config:
@@ -34,20 +30,7 @@ class TrainingConfig:
train_size: float = 0.8
train_split_stratify: bool = True
use_full_data_for_final_model: bool = True


@dataclass(config=Config)
class FeatureSelectionConfig:
"""Define feature selection parameters."""

selection_strategy: RFECV = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=StratifiedKFold(5, random_state=0, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=4,
)
min_features_to_select: int = 5


@dataclass
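A short sketch of how the relocated setting reads after this change; the attribute names come from the diff, the values are illustrative:

```python
from bluecast.config.training_config import TrainingConfig

train_config = TrainingConfig()
train_config.enable_feature_selection = True  # turn the built-in selection step on
train_config.min_features_to_select = 10      # replaces the old FeatureSelectionConfig knob
train_config.hypertuning_cv_folds = 5         # also avoids the cross-validation warning in initial_checks
```

The fallback `RFECVSelector` is instantiated in `fit` with `min_features_to_select=self.conf_training.min_features_to_select`, so the threshold now flows through `TrainingConfig` instead of a separate `FeatureSelectionConfig` object.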
18 changes: 7 additions & 11 deletions bluecast/evaluation/eval_metrics.py
@@ -28,34 +28,30 @@ def balanced_log_loss(y_true, y_pred):
return (l0.mean() + l1.mean()) / 2


def eval_classifier(y_true: np.ndarray, y_classes: np.ndarray) -> Dict[str, Any]:
def eval_classifier(
y_true: np.ndarray, y_probs: np.ndarray, y_classes: np.ndarray
) -> Dict[str, Any]:
try:
matthews = matthews_corrcoef(y_true, y_classes)
except Exception:
matthews = 0

print(f"The Matthew correlation is {matthews}")
logger(f"The Matthew correlation is {matthews}")
print("-------------------")
accuracy = accuracy_score(y_true, y_classes)
print(f"The accuracy is {accuracy}")
logger(f"The accuracy is {accuracy}")
recall = recall_score(y_true, y_classes, average="weighted")
print(f"The recall is {recall}")
logger(f"The recall is {recall}")
f1_score_macro = f1_score(y_true, y_classes, average="macro", zero_division=0)
print(f"The macro F1 score is {f1_score_macro}")
logger(f"The macro F1 score is {f1_score_macro}")
f1_score_micro = f1_score(y_true, y_classes, average="micro", zero_division=0)
print(f"The micro F1 score is {f1_score_micro}")
logger(f"The micro F1 score is {f1_score_micro}")
f1_score_weighted = f1_score(y_true, y_classes, average="weighted", zero_division=0)
print(f"The weighted F1 score is {f1_score_weighted}")
logger(f"The weighted F1 score is {f1_score_weighted}")
bll = balanced_log_loss(y_true, y_classes)
print(f"The balanced logloss is {bll}")
bll = balanced_log_loss(y_true, y_probs)
logger(f"The balanced logloss is {bll}")

full_classification_report = classification_report(y_true, y_classes)
print(full_classification_report)
logger(full_classification_report)

evaluation_scores = {
"matthews": matthews,
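A hedged usage sketch of the new `eval_classifier` signature; the arrays are made up, and only the argument order (`y_true, y_probs, y_classes`) and the `matthews` key are taken from the diff:

```python
import numpy as np

from bluecast.evaluation.eval_metrics import eval_classifier

# Illustrative arrays; in the pipeline y_probs and y_classes come from BlueCast.predict().
y_true = np.array([0, 1, 1, 0, 1, 0])
y_probs = np.array([0.1, 0.8, 0.7, 0.4, 0.9, 0.2])  # probabilities now feed balanced_log_loss
y_classes = (y_probs > 0.5).astype(int)              # hard labels feed the remaining metrics

scores = eval_classifier(y_true, y_probs, y_classes)
print(scores["matthews"])
```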