diff --git a/requirements.dev.txt b/requirements.dev.txt index 91de2350..457696b5 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -25,7 +25,7 @@ nbsphinx==0.8.8 sphinx_material==0.0.35 pytest>=6.2.5 pytest-cov>=2.8.1 -scikit-learn>=1.0.1,<1.4 +scikit-learn>=1.4.0 xgboost>=1.0.0 nbformat>4.2.0 numba>=0.53.1 diff --git a/setup.py b/setup.py index cfbd6f51..3dfc9325 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ "dash-table>=5.0.0", "nbformat>4.2.0", "numba>=0.53.1", - "scikit-learn>=1.0.1,<1.4", + "scikit-learn>=1.4.0", "category_encoders>=2.6.0", "scipy>=0.19.1", ] diff --git a/shapash/explainer/smart_explainer.py b/shapash/explainer/smart_explainer.py index 72489c5b..4fbda92d 100644 --- a/shapash/explainer/smart_explainer.py +++ b/shapash/explainer/smart_explainer.py @@ -240,7 +240,14 @@ def __init__( self.features_imp = None def compile( - self, x, contributions=None, y_pred=None, y_target=None, additional_data=None, additional_features_dict=None + self, + x, + contributions=None, + y_pred=None, + proba_values=None, + y_target=None, + additional_data=None, + additional_features_dict=None, ): """ The compile method is the first step to understand model and @@ -266,6 +273,11 @@ def compile( This is an interesting parameter for more explicit outputs. Shapash lets users define their own predict, as they may wish to set their own threshold (classification) + proba_values : pandas.Series or pandas.DataFrame, optional (default: None) + Probability values (1 column only). + The index must be identical to the index of x_init. + This is an interesting parameter for more explicit outputs. + Shapash lets users define their own probability values y_target : pandas.Series or pandas.DataFrame, optional (default: None) Target values (1 column only). The index must be identical to the index of x_init. @@ -291,6 +303,13 @@ def compile( x_init = inverse_transform(self.x_encoded, self.preprocessing) self.x_init = handle_categorical_missing(x_init) self.y_pred = check_y(self.x_init, y_pred, y_name="y_pred") + if (self.y_pred is None) and (hasattr(self.model, "predict")): + self.predict() + + self.proba_values = check_y(self.x_init, proba_values, y_name="proba_values") + if (self._case == "classification") and (self.proba_values is None) and (hasattr(self.model, "predict_proba")): + self.predict_proba() + self.y_target = check_y(self.x_init, y_target, y_name="y_target") self.prediction_error = predict_error(self.y_target, self.y_pred, self._case) @@ -405,6 +424,7 @@ def define_style(self, palette_name=None, colors_dict=None): def add( self, y_pred=None, + proba_values=None, y_target=None, label_dict=None, features_dict=None, @@ -423,6 +443,9 @@ def add( y_pred : pandas.Series, optional (default: None) Prediction values (1 column only). The index must be identical to the index of x_init. + proba_values : pandas.Series, optional (default: None) + Probability values (1 column only). + The index must be identical to the index of x_init. label_dict: dict, optional (default: None) Dictionary mapping integer labels to domain names. features_dict: dict, optional (default: None) @@ -446,6 +469,8 @@ def add( self.y_pred = check_y(self.x_init, y_pred, y_name="y_pred") if hasattr(self, "y_target"): self.prediction_error = predict_error(self.y_target, self.y_pred, self._case) + if proba_values is not None: + self.proba_values = check_y(self.x_init, proba_values, y_name="proba_values") if y_target is not None: self.y_target = check_y(self.x_init, y_target, y_name="y_target") if hasattr(self, "y_pred"): @@ -895,7 +920,7 @@ def to_pandas( ) # Matching with y_pred if proba: - self.predict_proba() if proba else None + self.predict_proba() proba_values = self.proba_values else: proba_values = None @@ -1006,8 +1031,6 @@ def init_app(self, settings: dict = None): Possible settings (dict keys) are 'rows', 'points', 'violin', 'features' Values should be positive ints """ - if self.y_pred is None: - self.predict() self.smartapp = SmartApp(self, settings) def run_app( @@ -1046,8 +1069,6 @@ def run_app( if title_story is not None: self.title_story = title_story - if self.y_pred is None: - self.predict() if hasattr(self, "_case"): self.smartapp = SmartApp(self, settings) if host is None: diff --git a/shapash/explainer/smart_plotter.py b/shapash/explainer/smart_plotter.py index 78ec8981..e42b447f 100644 --- a/shapash/explainer/smart_plotter.py +++ b/shapash/explainer/smart_plotter.py @@ -949,9 +949,7 @@ def local_pred(self, index, label=None): float: Predict or predict_proba value """ if self.explainer._case == "classification": - if hasattr(self.explainer.model, "predict_proba"): - if not hasattr(self.explainer, "proba_values"): - self.explainer.predict_proba() + if self.explainer.proba_values is not None: value = self.explainer.proba_values.iloc[:, [label]].loc[index].values[0] else: value = None @@ -1237,9 +1235,7 @@ def contribution_plot( col_value = self.explainer._classes[label_num] subtitle = f"Response: {label_value}" # predict proba Color scale - if proba and hasattr(self.explainer.model, "predict_proba"): - if not hasattr(self.explainer, "proba_values"): - self.explainer.predict_proba() + if proba and self.explainer.proba_values is not None: proba_values = self.explainer.proba_values.iloc[:, [label_num]] if not hasattr(self, "pred_colorscale"): self.pred_colorscale = {} @@ -3209,12 +3205,7 @@ def _prediction_classification_plot( label_num, _, label_value = self.explainer.check_label_name(label) # predict proba Color scale - if hasattr(self.explainer.model, "predict_proba"): - if not hasattr(self.explainer, "proba_values"): - self.explainer.predict_proba() - if hasattr(self.explainer.model, "predict"): - if not hasattr(self.explainer, "y_pred") or self.explainer.y_pred is None: - self.explainer.predict() + if self.explainer.proba_values is not None: # Assign proba values of the target df_proba_target = self.explainer.proba_values.copy() df_proba_target["proba_target"] = df_proba_target.iloc[:, label_num] @@ -3333,9 +3324,6 @@ def _prediction_regression_plot( fig = go.Figure() subtitle = None - if self.explainer.y_pred is None: - if hasattr(self.explainer.model, "predict"): - self.explainer.predict() prediction_error = self.explainer.prediction_error if prediction_error is not None: if (self.explainer.y_target == 0).any()[0]: diff --git a/shapash/utils/columntransformer_backend.py b/shapash/utils/columntransformer_backend.py index 52fe4b6b..09a0ab09 100644 --- a/shapash/utils/columntransformer_backend.py +++ b/shapash/utils/columntransformer_backend.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd +from sklearn.preprocessing import FunctionTransformer from shapash.utils.category_encoder_backend import ( category_encoder_binary, @@ -91,7 +92,7 @@ def inv_transform_ct(x_in, encoding): # columns not encode elif name_encoding == "remainder": - if ct_encoding == "passthrough": + if isinstance(ct_encoding, FunctionTransformer): nb_col = len(col_encoding) frame = x_in.iloc[:, init : init + nb_col] else: @@ -249,7 +250,7 @@ def calc_inv_contrib_ct(x_contrib, encoding, agg_columns): init += nb_col elif name_encoding == "remainder": - if ct_encoding == "passthrough": + if isinstance(ct_encoding, FunctionTransformer): nb_col = len(col_encoding) frame = x_contrib.iloc[:, init : init + nb_col] rst = pd.concat([rst, frame], axis=1) @@ -366,7 +367,9 @@ def get_feature_names(column_transformer): List of returned features names when ColumnTransformer is applied. """ feature_names = [] - l_transformers = list(column_transformer._iter(fitted=True)) + l_transformers = list( + column_transformer._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True) + ) for name, trans, column, _ in l_transformers: feature_names.extend(get_names(name, trans, column, column_transformer)) @@ -463,11 +466,8 @@ def get_col_mapping_ct(encoder, x_encoded): else: raise NotImplementedError(f"Estimator not supported : {estimator}") - elif estimator == "passthrough": - try: - features_out = encoder.feature_names_in_[features] - except Exception: - features_out = encoder._feature_names_in[features] # for oldest sklearn version + elif isinstance(estimator, FunctionTransformer): + features_out = encoder.feature_names_in_[features] for f_name in features_out: dict_col_mapping[f_name] = [x_encoded.columns.to_list()[idx_encoded]] idx_encoded += 1 diff --git a/shapash/utils/transform.py b/shapash/utils/transform.py index 42ddf779..37f56655 100644 --- a/shapash/utils/transform.py +++ b/shapash/utils/transform.py @@ -1,10 +1,12 @@ """ Transform Module """ + import re import numpy as np import pandas as pd +from sklearn.preprocessing import FunctionTransformer from shapash.utils.category_encoder_backend import ( get_col_mapping_ce, @@ -185,7 +187,7 @@ def check_transformers(list_encoding): if (str(type(ct_encoding)) not in supported_sklearn) and ( str(type(ct_encoding)) not in supported_category_encoder ): - if str(type(ct_encoding)) != "": + if not isinstance(ct_encoding, str) and not isinstance(ct_encoding, FunctionTransformer): raise ValueError("One of the encoders used in ColumnTransformers isn't supported.") elif str(type(enc)) in supported_category_encoder: diff --git a/tests/unit_tests/explainer/test_smart_plotter.py b/tests/unit_tests/explainer/test_smart_plotter.py index 57f279a0..afd19220 100644 --- a/tests/unit_tests/explainer/test_smart_plotter.py +++ b/tests/unit_tests/explainer/test_smart_plotter.py @@ -108,6 +108,7 @@ def setUp(self): self.smart_explainer._case, self.smart_explainer._classes = check_model(model) self.smart_explainer.state = MultiDecorator(SmartState()) self.smart_explainer.y_pred = None + self.smart_explainer.proba_values = None self.smart_explainer.features_desc = dict(self.x_init.nunique()) self.smart_explainer.features_compacity = self.features_compacity @@ -863,7 +864,7 @@ def test_contribution_plot_8(self): xpl.model = model np_hv = [f"Id: {x}
Predict: {y}" for x, y in zip(xpl.x_init.index, xpl.y_pred.iloc[:, 0].tolist())] np_hv.sort() - output = xpl.plot.contribution_plot(col) + output = xpl.plot.contribution_plot(col, proba=False) annot_list = [] for data_plot in output.data: annot_list.extend(data_plot.hovertext.tolist()) @@ -895,7 +896,7 @@ def test_contribution_plot_9(self): model = lambda: None model.classes_ = np.array([0, 1]) xpl.model = model - output = xpl.plot.contribution_plot(col, max_points=39) + output = xpl.plot.contribution_plot(col, max_points=39, proba=False) assert len(output.data) == 4 for elem in output.data: assert elem.type == "violin" @@ -1266,6 +1267,9 @@ def test_features_importance_4(self): def test_local_pred_1(self): xpl = self.smart_explainer + xpl.proba_values = pd.DataFrame( + data=np.array([[0.4, 0.6], [0.3, 0.7]]), columns=["class_1", "class_2"], index=xpl.x_encoded.index.values + ) output = xpl.plot.local_pred("person_A", label=0) assert isinstance(output, float) diff --git a/tests/unit_tests/utils/test_columntransformer_backend.py b/tests/unit_tests/utils/test_columntransformer_backend.py index 15541d6e..78a44d5f 100644 --- a/tests/unit_tests/utils/test_columntransformer_backend.py +++ b/tests/unit_tests/utils/test_columntransformer_backend.py @@ -1,6 +1,7 @@ """ Unit test of Inverse Transform """ + import unittest import catboost as cb @@ -959,25 +960,25 @@ def test_get_names_1(self): enc_4.fit(train) feature_names_1 = [] - l_transformers = list(enc_1._iter(fitted=True)) + l_transformers = list(enc_1._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True)) for name, trans, column, _ in l_transformers: feature_names_1.extend(get_names(name, trans, column, enc_1)) feature_names_2 = [] - l_transformers = list(enc_2._iter(fitted=True)) + l_transformers = list(enc_2._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True)) for name, trans, column, _ in l_transformers: feature_names_2.extend(get_names(name, trans, column, enc_2)) feature_names_3 = [] - l_transformers = list(enc_3._iter(fitted=True)) + l_transformers = list(enc_3._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True)) for name, trans, column, _ in l_transformers: feature_names_3.extend(get_names(name, trans, column, enc_3)) feature_names_4 = [] - l_transformers = list(enc_4._iter(fitted=True)) + l_transformers = list(enc_4._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True)) for name, trans, column, _ in l_transformers: feature_names_4.extend(get_names(name, trans, column, enc_4))