diff --git a/README.md b/README.md
index ded4ef98..e01ffe05 100644
--- a/README.md
+++ b/README.md
@@ -32,8 +32,9 @@ the full documentation [here](https://bluecast.readthedocs.io/en/latest/).
 * [General usage](#general-usage)
   * [Basic usage](#basic-usage)
   * [Advanced usage](#advanced-usage)
+    * [Explanatory analysis](#explanatory-analysis)
+    * [Enable cross-validation](#enable-cross-validation)
     * [Categorical encoding](#categorical-encoding)
-    * [Custom training configuration](#custom--training-configuration)
     * [Custom preprocessing](#custom-preprocessing)
     * [Custom feature selection](#custom-feature-selection)
     * [Custom ML model](#custom-ml-model)
@@ -89,28 +90,67 @@ y_probs, y_classes = automl.predict(df_val)
 
 ### Advanced usage
 
+#### Explanatory analysis
+
+BlueCast offers a simple way to get a first overview of the data:
+
+```sh
+from bluecast.eda.analyse import (
+    bi_variate_plots,
+    correlation_heatmap,
+    correlation_to_target,
+    univariate_plots,
+)
+from bluecast.preprocessing.feature_types import FeatureTypeDetector
+
+# Here we automatically detect the numeric columns
+feat_type_detector = FeatureTypeDetector()
+train_data = feat_type_detector.fit_transform_feature_types(train_data)
+
+# show univariate plots (the target column EC1 is already part of num_columns)
+univariate_plots(
+    train_data.loc[:, feat_type_detector.num_columns],
+    "EC1",
+)
+
+# show bi-variate plots
+bi_variate_plots(
+    train_data.loc[:, feat_type_detector.num_columns],
+    "EC1",
+)
+
+# show correlation heatmap
+correlation_heatmap(train_data.loc[:, feat_type_detector.num_columns])
+
+# show correlation to target
+correlation_to_target(
+    train_data.loc[:, feat_type_detector.num_columns],
+    "EC1",
+)
+```
+
 #### Enable cross-validation
 
 While the default behaviour of BlueCast is to use a simple
 train-test-split, cross-validation can be enabled easily:
 
 ```sh
 from bluecast.blueprints.cast import BlueCast
-from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig
+from bluecast.config.training_config import TrainingConfig
 
 # Create a custom training config and adjust general training parameters
 train_config = TrainingConfig()
 train_config.hypertuning_cv_folds = 5 # default is 1
 
 # Pass the custom configs to the BlueCast class
 automl = BlueCast(
     class_problem="binary",
-    target_column="target"
+    target_column="target",
     conf_training=train_config,
 )
 
 automl.fit(df_train, target_col="target")
 y_probs, y_classes = automl.predict(df_val)
 ```
 
 #### Categorical encoding
diff --git a/bluecast/config/training_config.py b/bluecast/config/training_config.py
index 2fc88e1e..b6c11514 100644
--- a/bluecast/config/training_config.py
+++ b/bluecast/config/training_config.py
@@ -39,7 +39,7 @@ class XgboostTuneParamsConfig:
     """Define hyperparameter tuning search space."""
 
     max_depth_min: int = 2
-    max_depth_max: int = 3
+    max_depth_max: int = 6
     alpha_min: float = 0.0
     alpha_max: float = 10.0
     lambda_min: float = 0.0
diff --git a/bluecast/ml_modelling/xgboost.py b/bluecast/ml_modelling/xgboost.py
index d677a3bb..d477c32e 100644
--- a/bluecast/ml_modelling/xgboost.py
+++ b/bluecast/ml_modelling/xgboost.py
@@ -189,7 +189,9 @@ def objective(trial):
                 "lambda", self.conf_xgboost.lambda_min, self.conf_xgboost.lambda_max
             ),
             "min_child_weight": trial.suggest_float(
-                "min_child_weight", self.conf_xgboost.min_child_weight_min, self.conf_xgboost.min_child_weight_max
+                "min_child_weight",
+                self.conf_xgboost.min_child_weight_min,
+                self.conf_xgboost.min_child_weight_max,
             ),
             "num_leaves": trial.suggest_int(
                 "num_leaves",
@@ -268,7 +270,7 @@ def objective(trial):
         algorithm = "xgboost"
 
         sampler = optuna.samplers.TPESampler(
-            multivariate=True, seed=self.conf_training.global_random_state
+            multivariate=False, seed=self.conf_training.global_random_state
         )
         study = optuna.create_study(
             direction="minimize",
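Editor's note: the two library hunks above widen the tuned `max_depth` range from 2–3 to 2–6 and reflow the `min_child_weight` suggestion. Below is a minimal sketch of what that search-space change means in practice; the toy objective and the locally defined bounds are illustrative stand-ins, not BlueCast code.

```python
import optuna

# Bounds mirror XgboostTuneParamsConfig after this diff (defined locally so
# the sketch runs without BlueCast installed).
MAX_DEPTH_MIN, MAX_DEPTH_MAX = 2, 6  # upper bound was 3 before this change
ALPHA_MIN, ALPHA_MAX = 0.0, 10.0


def objective(trial: optuna.Trial) -> float:
    # Optuna now samples tree depth from the wider [2, 6] range
    max_depth = trial.suggest_int("max_depth", MAX_DEPTH_MIN, MAX_DEPTH_MAX)
    alpha = trial.suggest_float("alpha", ALPHA_MIN, ALPHA_MAX)
    # toy loss standing in for BlueCast's real validation metric
    return alpha / max_depth


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)
print(study.best_params)
```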
diff --git a/dist/bluecast-0.6.11.tar.gz b/dist/bluecast-0.6.11.tar.gz
deleted file mode 100644
index 845ed7eb..00000000
Binary files a/dist/bluecast-0.6.11.tar.gz and /dev/null differ
diff --git a/dist/bluecast-0.6.11-py3-none-any.whl b/dist/bluecast-0.7-py3-none-any.whl
similarity index 63%
rename from dist/bluecast-0.6.11-py3-none-any.whl
rename to dist/bluecast-0.7-py3-none-any.whl
index c1a2e504..eabb3bbf 100644
Binary files a/dist/bluecast-0.6.11-py3-none-any.whl and b/dist/bluecast-0.7-py3-none-any.whl differ
diff --git a/dist/bluecast-0.7.tar.gz b/dist/bluecast-0.7.tar.gz
new file mode 100644
index 00000000..e9a134fc
Binary files /dev/null and b/dist/bluecast-0.7.tar.gz differ
diff --git a/docs/source/index.md b/docs/source/index.md
index 29cb42a8..e01ffe05 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -32,8 +32,9 @@ the full documentation [here](https://bluecast.readthedocs.io/en/latest/).
 * [General usage](#general-usage)
   * [Basic usage](#basic-usage)
   * [Advanced usage](#advanced-usage)
+    * [Explanatory analysis](#explanatory-analysis)
+    * [Enable cross-validation](#enable-cross-validation)
     * [Categorical encoding](#categorical-encoding)
-    * [Custom training configuration](#custom--training-configuration)
     * [Custom preprocessing](#custom-preprocessing)
     * [Custom feature selection](#custom-feature-selection)
     * [Custom ML model](#custom-ml-model)
@@ -89,28 +90,67 @@ y_probs, y_classes = automl.predict(df_val)
 
 ### Advanced usage
 
+#### Explanatory analysis
+
+BlueCast offers a simple way to get a first overview of the data:
+
+```sh
+from bluecast.eda.analyse import (
+    bi_variate_plots,
+    correlation_heatmap,
+    correlation_to_target,
+    univariate_plots,
+)
+from bluecast.preprocessing.feature_types import FeatureTypeDetector
+
+# Here we automatically detect the numeric columns
+feat_type_detector = FeatureTypeDetector()
+train_data = feat_type_detector.fit_transform_feature_types(train_data)
+
+# show univariate plots (the target column EC1 is already part of num_columns)
+univariate_plots(
+    train_data.loc[:, feat_type_detector.num_columns],
+    "EC1",
+)
+
+# show bi-variate plots
+bi_variate_plots(
+    train_data.loc[:, feat_type_detector.num_columns],
+    "EC1",
+)
+
+# show correlation heatmap
+correlation_heatmap(train_data.loc[:, feat_type_detector.num_columns])
+
+# show correlation to target
+correlation_to_target(
+    train_data.loc[:, feat_type_detector.num_columns],
+    "EC1",
+)
+```
+
 #### Enable cross-validation
 
 While the default behaviour of BlueCast is to use a simple
 train-test-split, cross-validation can be enabled easily:
 
 ```sh
 from bluecast.blueprints.cast import BlueCast
-from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig
+from bluecast.config.training_config import TrainingConfig
 
 # Create a custom training config and adjust general training parameters
 train_config = TrainingConfig()
 train_config.hypertuning_cv_folds = 5 # default is 1
 
 # Pass the custom configs to the BlueCast class
 automl = BlueCast(
     class_problem="binary",
-    target_column="target"
+    target_column="target",
     conf_training=train_config,
 )
 
 automl.fit(df_train, target_col="target")
 y_probs, y_classes = automl.predict(df_val)
 ```
 
 #### Categorical encoding
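Editor's note: the `multivariate=False` change in `xgboost.py` above switches Optuna's TPE sampler from its experimental joint (multivariate) mode back to the default per-parameter mode, while keeping the run seeded. A stand-alone sketch of that construction follows; the quadratic objective and the literal seed are illustrative, not taken from the diff.

```python
import optuna

# Each hyperparameter is now modelled independently (Optuna's default);
# seeding mirrors BlueCast's use of conf_training.global_random_state.
sampler = optuna.samplers.TPESampler(multivariate=False, seed=33)

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(lambda trial: trial.suggest_float("x", -10.0, 10.0) ** 2, n_trials=20)
print(study.best_params)  # best x should drift toward 0
```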
diff --git a/pyproject.toml b/pyproject.toml
index ca9d8688..513bced0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "bluecast"
-version = "0.6.11"
+version = "0.7"
 description = "A lightweight and fast automl framework"
 authors = ["Thomas Meißner "]
 license = "GPL-3.0-only"
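Editor's note: with the version bump to 0.7, the wider `max_depth` default above ships to users. Below is a hedged sketch of narrowing it back via the config objects this diff touches; the `conf_xgboost` keyword is an assumption inferred from `conf_training` and the `self.conf_xgboost` references in `xgboost.py`, not confirmed by the diff itself.

```python
from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig

# Restore the pre-0.7 upper bound of the depth search space
xgb_params = XgboostTuneParamsConfig()
xgb_params.max_depth_max = 3

# General training behaviour, as in the README example
train_config = TrainingConfig()
train_config.hypertuning_cv_folds = 5  # default is 1

automl = BlueCast(
    class_problem="binary",
    target_column="target",
    conf_training=train_config,
    conf_xgboost=xgb_params,  # assumed keyword; not shown in this diff
)
# then fit/predict exactly as in the README's cross-validation example
```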