Commit

Update ReadMe

ThomasMeissnerDS committed Oct 16, 2023
1 parent 2076d33 commit 7e86137

Showing 4 changed files with 83 additions and 3 deletions.
38 changes: 38 additions & 0 deletions README.md
@@ -32,6 +32,7 @@ the full documentation [here](https://bluecast.readthedocs.io/en/latest/).
* [Advanced usage](#advanced-usage)
* [Explanatory analysis](#explanatory-analysis)
* [Enable cross-validation](#enable-cross-validation)
* [Gaining extra performance](#gaining-extra-performance)
* [Use multi-model blended pipeline](#use-multi-model-blended-pipeline)
* [Categorical encoding](#categorical-encoding)
* [Custom training configuration](#custom--training-configuration)
@@ -182,6 +183,43 @@ automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
```

#### Gaining extra performance

By default, BlueCast uses Optuna's Bayesian hyperparameter optimization.
However, Bayesian methods only estimate the optimum and do not necessarily
find the ideal spot. Thus, BlueCast offers an optional GridSearch setting
that refines some of the parameters Optuna has found.

```python
from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig


# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.hypertuning_cv_folds = 5 # default is 1
train_config.enable_grid_search_fine_tuning = True # default is False

# Pass the custom configs to the BlueCast class
automl = BlueCast(
class_problem="binary",
    target_column="target",
conf_training=train_config,
)

automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
```

This comes at the cost of longer runtime. The behaviour can be further
controlled with two parameters (see the sketch after this list):

* `gridsearch_nb_parameters_per_grid`: Decides how many steps the grid
  shall have per parameter.
* `gridsearch_tuning_max_runtime_secs`: Sets the maximum time in seconds
  the tuning shall run. The currently running trial is still allowed to
  finish, so this limit can be slightly exceeded.
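
A minimal sketch of setting both knobs on the same `TrainingConfig` as
above; the values shown here are illustrative, not recommendations:

```python
from bluecast.config.training_config import TrainingConfig

# Control the fine-grained grid search behaviour
train_config = TrainingConfig()
train_config.enable_grid_search_fine_tuning = True  # default is False
train_config.gridsearch_nb_parameters_per_grid = 3  # default is 5; fewer steps -> shorter runtime
train_config.gridsearch_tuning_max_runtime_secs = 1800  # default is 3600; soft cap, running trial still finishes
```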

#### Use multi-model blended pipeline

By default, BlueCast trains a single model. However, it is possible to
4 changes: 4 additions & 0 deletions bluecast/config/training_config.py
@@ -42,6 +42,9 @@ class TrainingConfig(BaseModel):
:param show_detailed_tuning_logs: Whether to show detailed tuning logs. Not used when custom ML model is passed.
:param enable_grid_search_fine_tuning: After hyperparameter tuning run Gridsearch tuning on a fine-grained grid
based on the previous hyperparameter tuning. Only possible when autotune_model is True.
:param gridsearch_nb_parameters_per_grid: Decides how many steps the grid shall have per parameter.
:param gridsearch_tuning_max_runtime_secs: Sets the maximum time in seconds the tuning shall run. The currently
running trial is still allowed to finish, so this limit can be slightly exceeded.
:param experiment_name: Name of the experiment. Will be logged inside the ExperimentTracker.
"""

@@ -63,6 +66,7 @@ class TrainingConfig(BaseModel):
optuna_sampler_n_startup_trials: int = 10
enable_grid_search_fine_tuning: bool = False
gridsearch_tuning_max_runtime_secs: int = 3600
gridsearch_nb_parameters_per_grid: int = 5
experiment_name: str = "new experiment"


6 changes: 3 additions & 3 deletions bluecast/ml_modelling/xgboost.py
@@ -531,19 +531,19 @@ def objective(trial):
self.conf_params_xgboost.params["alpha"]
* 0.9,  # TODO: fix design flaw in config and get rid of nested dict
self.conf_params_xgboost.params["alpha"] * 1.1,
- 5,
+ self.conf_training.gridsearch_nb_parameters_per_grid,
dtype=float,
),
"lambda": np.linspace(
self.conf_params_xgboost.params["lambda"] * 0.9,
self.conf_params_xgboost.params["lambda"] * 1.1,
- 5,
+ self.conf_training.gridsearch_nb_parameters_per_grid,
dtype=float,
),
"eta": np.linspace(
self.conf_params_xgboost.params["eta"] * 0.9,
self.conf_params_xgboost.params["eta"] * 1.1,
- 5,
+ self.conf_training.gridsearch_nb_parameters_per_grid,
dtype=float,
),
}
38 changes: 38 additions & 0 deletions docs/source/index.md
@@ -32,6 +32,7 @@ the full documentation [here](https://bluecast.readthedocs.io/en/latest/).
* [Advanced usage](#advanced-usage)
* [Explanatory analysis](#explanatory-analysis)
* [Enable cross-validation](#enable-cross-validation)
* [Gaining extra performance](#gaining-extra-performance)
* [Use multi-model blended pipeline](#use-multi-model-blended-pipeline)
* [Categorical encoding](#categorical-encoding)
* [Custom training configuration](#custom--training-configuration)
@@ -182,6 +183,43 @@ automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
```

#### Gaining extra performance

By default, BlueCast uses Optuna's Bayesian hyperparameter optimization.
However, Bayesian methods only estimate the optimum and do not necessarily
find the ideal spot. Thus, BlueCast offers an optional GridSearch setting
that refines some of the parameters Optuna has found.

```python
from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig


# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.hypertuning_cv_folds = 5 # default is 1
train_config.enable_grid_search_fine_tuning = True # default is False

# Pass the custom configs to the BlueCast class
automl = BlueCast(
class_problem="binary",
    target_column="target",
conf_training=train_config,
)

automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
```

This comes at the cost of longer runtime. The behaviour can be further
controlled with two parameters (see the sketch after this list):

* `gridsearch_nb_parameters_per_grid`: Decides how many steps the grid
  shall have per parameter.
* `gridsearch_tuning_max_runtime_secs`: Sets the maximum time in seconds
  the tuning shall run. The currently running trial is still allowed to
  finish, so this limit can be slightly exceeded.
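
A minimal sketch of setting both knobs on the same `TrainingConfig` as
above; the values shown here are illustrative, not recommendations:

```python
from bluecast.config.training_config import TrainingConfig

# Control the fine-grained grid search behaviour
train_config = TrainingConfig()
train_config.enable_grid_search_fine_tuning = True  # default is False
train_config.gridsearch_nb_parameters_per_grid = 3  # default is 5; fewer steps -> shorter runtime
train_config.gridsearch_tuning_max_runtime_secs = 1800  # default is 3600; soft cap, running trial still finishes
```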

#### Use multi-model blended pipeline

By default, BlueCast trains a single model. However, it is possible to