Update ReadMe to also showcase EDA capabilities
ThomasMeissnerDS committed Jun 30, 2023
1 parent b04bb39 commit 6c92671
Showing 8 changed files with 81 additions and 31 deletions.
52 changes: 38 additions & 14 deletions README.md
@@ -32,8 +32,9 @@ the full documentation [here](https://bluecast.readthedocs.io/en/latest/).
* [General usage](#general-usage)
* [Basic usage](#basic-usage)
* [Advanced usage](#advanced-usage)
* [Explanatory analysis](#explanatory-analysis)
* [Enable cross-validation](#enable-cross-validation)
* [Categorical encoding](#categorical-encoding)
* [Custom training configuration](#custom-training-configuration)
* [Custom preprocessing](#custom-preprocessing)
* [Custom feature selection](#custom-feature-selection)
* [Custom ML model](#custom-ml-model)
@@ -89,29 +90,52 @@ y_probs, y_classes = automl.predict(df_val)

### Advanced usage

#### Explanatory analysis

BlueCast offers a simple way to get a first overview of the data:

```python
from bluecast.eda.analyse import (
    bi_variate_plots,
    correlation_heatmap,
    correlation_to_target,
    univariate_plots,
)
from bluecast.preprocessing.feature_types import FeatureTypeDetector

# Here we automatically detect the numeric columns
feat_type_detector = FeatureTypeDetector()
train_data = feat_type_detector.fit_transform_feature_types(train_data)

# show univariate plots
univariate_plots(
    train_data.loc[
        :, feat_type_detector.num_columns  # here the target column EC1 is already included
    ],
    "EC1",
)

# show bi-variate plots
bi_variate_plots(
    train_data.loc[:, feat_type_detector.num_columns],
    "EC1",
)

# show correlation heatmap
correlation_heatmap(train_data.loc[:, feat_type_detector.num_columns])

# show correlation to target
correlation_to_target(
    train_data.loc[:, feat_type_detector.num_columns],
    "EC1",
)
```

#### Enable cross-validation

While the default behaviour of BlueCast is to use a simple
train-test-split, cross-validation can be enabled easily:

```python
from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig

# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.hypertuning_cv_folds = 5  # default is 1

# Pass the custom configs to the BlueCast class
automl = BlueCast(
    class_problem="binary",
    target_column="target",
    conf_training=train_config,
)

automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
```
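The `correlation_to_target` helper reports how strongly each numeric feature relates to the target. As a rough illustration of the idea only (not BlueCast's actual implementation), a per-column Pearson correlation can be computed with the standard library alone; the column names and toy values below are made up:

```python
import math

def pearson(xs, ys):
    # classic Pearson correlation coefficient between two numeric columns
    n = len(xs)
    mx, my = sum(xs) / n, sum(ys) / n
    cov = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    sx = math.sqrt(sum((x - mx) ** 2 for x in xs))
    sy = math.sqrt(sum((y - my) ** 2 for y in ys))
    return cov / (sx * sy)

def correlation_to_target_sketch(rows, target_col):
    # rows: list of dicts of numeric values, e.g. loaded from a CSV
    target = [r[target_col] for r in rows]
    return {
        col: pearson([r[col] for r in rows], target)
        for col in rows[0]
        if col != target_col
    }

# hypothetical toy data; "EC1" mirrors the target name used above
data = [
    {"feat_a": 1.0, "feat_b": 9.0, "EC1": 0.0},
    {"feat_a": 2.0, "feat_b": 7.0, "EC1": 1.0},
    {"feat_a": 3.0, "feat_b": 5.0, "EC1": 1.0},
    {"feat_a": 4.0, "feat_b": 3.0, "EC1": 2.0},
]
corr = correlation_to_target_sketch(data, "EC1")
# feat_a rises with the target (positive r), feat_b falls (negative r)
```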

#### Categorical encoding
2 changes: 1 addition & 1 deletion bluecast/config/training_config.py
@@ -39,7 +39,7 @@ class XgboostTuneParamsConfig:
"""Define hyperparameter tuning search space."""

max_depth_min: int = 2
- max_depth_max: int = 3
+ max_depth_max: int = 6
alpha_min: float = 0.0
alpha_max: float = 10.0
lambda_min: float = 0.0
6 changes: 4 additions & 2 deletions bluecast/ml_modelling/xgboost.py
@@ -189,7 +189,9 @@ def objective(trial):
"lambda", self.conf_xgboost.lambda_min, self.conf_xgboost.lambda_max
),
"min_child_weight": trial.suggest_float(
-                 "min_child_weight", self.conf_xgboost.min_child_weight_min, self.conf_xgboost.min_child_weight_max
+                 "min_child_weight",
+                 self.conf_xgboost.min_child_weight_min,
+                 self.conf_xgboost.min_child_weight_max,
),
"num_leaves": trial.suggest_int(
"num_leaves",
@@ -268,7 +270,7 @@ def objective(trial):

algorithm = "xgboost"
sampler = optuna.samplers.TPESampler(
-         multivariate=True, seed=self.conf_training.global_random_state
+         multivariate=False, seed=self.conf_training.global_random_state
)
study = optuna.create_study(
direction="minimize",
Binary file removed dist/bluecast-0.6.11.tar.gz
Binary file added dist/bluecast-0.7.tar.gz
50 changes: 37 additions & 13 deletions docs/source/index.md
@@ -32,8 +32,9 @@ the full documentation [here](https://bluecast.readthedocs.io/en/latest/).
* [General usage](#general-usage)
* [Basic usage](#basic-usage)
* [Advanced usage](#advanced-usage)
* [Explanatory analysis](#explanatory-analysis)
* [Enable cross-validation](#enable-cross-validation)
* [Categorical encoding](#categorical-encoding)
* [Custom training configuration](#custom-training-configuration)
* [Custom preprocessing](#custom-preprocessing)
* [Custom feature selection](#custom-feature-selection)
* [Custom ML model](#custom-ml-model)
@@ -89,29 +90,52 @@ y_probs, y_classes = automl.predict(df_val)

### Advanced usage

#### Explanatory analysis

BlueCast offers a simple way to get a first overview of the data:

```python
from bluecast.eda.analyse import (
    bi_variate_plots,
    correlation_heatmap,
    correlation_to_target,
    univariate_plots,
)
from bluecast.preprocessing.feature_types import FeatureTypeDetector

# Here we automatically detect the numeric columns
feat_type_detector = FeatureTypeDetector()
train_data = feat_type_detector.fit_transform_feature_types(train_data)

# show univariate plots
univariate_plots(
    train_data.loc[
        :, feat_type_detector.num_columns  # here the target column EC1 is already included
    ],
    "EC1",
)

# show bi-variate plots
bi_variate_plots(
    train_data.loc[:, feat_type_detector.num_columns],
    "EC1",
)

# show correlation heatmap
correlation_heatmap(train_data.loc[:, feat_type_detector.num_columns])

# show correlation to target
correlation_to_target(
    train_data.loc[:, feat_type_detector.num_columns],
    "EC1",
)
```

#### Enable cross-validation

While the default behaviour of BlueCast is to use a simple
train-test-split, cross-validation can be enabled easily:

```python
from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig

# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.hypertuning_cv_folds = 5  # default is 1

# Pass the custom configs to the BlueCast class
automl = BlueCast(
    class_problem="binary",
    target_column="target",
    conf_training=train_config,
)

automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
```
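Setting `hypertuning_cv_folds = 5` replaces the single train-test split with five folds during hyperparameter tuning, so every row is held out for validation exactly once. Conceptually (a hand-rolled sketch, not BlueCast's internal splitter), 5-fold indices partition the rows like this:

```python
def kfold_indices(n_rows: int, n_folds: int):
    """Yield (train_idx, valid_idx) pairs covering every row once as validation."""
    fold_sizes = [
        n_rows // n_folds + (1 if i < n_rows % n_folds else 0)
        for i in range(n_folds)
    ]
    start = 0
    for size in fold_sizes:
        valid = set(range(start, start + size))
        train = [i for i in range(n_rows) if i not in valid]
        yield train, sorted(valid)
        start += size

# 10 rows, 5 folds: each fold holds out 2 rows for validation
folds = list(kfold_indices(10, 5))
```

Each model candidate is then scored on all five validation folds rather than on one split, which makes the tuning signal less dependent on a lucky or unlucky split.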

#### Categorical encoding
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "bluecast"
- version = "0.6.11"
+ version = "0.7"
description = "A lightweight and fast automl framework"
authors = ["Thomas Meißner <[email protected]>"]
license = "GPL-3.0-only"
