From 8577576c40dcd9d7918fe13e1b33c96c78084604 Mon Sep 17 00:00:00 2001 From: MaximeLecardonnel6x7 Date: Thu, 21 Sep 2023 13:18:52 +0200 Subject: [PATCH 1/6] First wave removing ACV. --- README.md | 3 - docs/index.html | 2 +- requirements.dev.txt | 3 +- setup.py | 1 - shapash/backend/__init__.py | 1 - shapash/backend/acv_backend.py | 122 ----- shapash/decomposition/contributions.py | 4 - shapash/explainer/consistency.py | 70 +-- shapash/explainer/smart_explainer.py | 2 +- shapash/explainer/smart_state.py | 4 - shapash/utils/category_encoder_backend.py | 4 - shapash/utils/columntransformer_backend.py | 4 - tests/unit_tests/backend/test_acv_backend.py | 75 --- tests/unit_tests/backend/test_lime_backend.py | 4 +- .../unit_tests/explainer/test_consistency.py | 43 +- .../explainer/test_smart_explainer.py | 25 +- .../tuto-expl03-Shapash-acv-backend.ipynb | 441 ------------------ 17 files changed, 28 insertions(+), 780 deletions(-) delete mode 100644 shapash/backend/acv_backend.py delete mode 100644 tests/unit_tests/backend/test_acv_backend.py delete mode 100644 tutorial/explainer_and_backend/tuto-expl03-Shapash-acv-backend.ipynb diff --git a/README.md b/README.md index 6fc7e86d..8c4a58cc 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,6 @@ | 2.0.x | Refactoring Shapash
| Refactoring attributes of compile methods and init. Refactoring implementation for new backends | [](https://github.com/MAIF/shapash/blob/master/tutorial/explainer_and_backend/tuto-expl06-Shapash-custom-backend.ipynb) | 1.7.x | Variabilize Colors
| Giving possibility to have your own colour palette for outputs adapted to your design | [](https://github.com/MAIF/shapash/blob/master/tutorial/common/tuto-common02-colors.ipynb) | 1.6.x | Explainability Quality Metrics
[Article](https://towardsdatascience.com/building-confidence-on-explainability-methods-66b9ee575514) | To help increase confidence in explainability methods, you can evaluate the relevance of your explainability using 3 metrics: **Stability**, **Consistency** and **Compacity** | [](https://github.com/MAIF/shapash/blob/master/tutorial/explainability_quality/tuto-quality01-Builing-confidence-explainability.ipynb) -| 1.5.x | ACV Backend
| A new way of estimating Shapley values using ACV. [More info about ACV here](https://towardsdatascience.com/the-right-way-to-compute-your-shapley-values-cfea30509254). | [](tutorial/explainer_and_backend/tuto-expl03-Shapash-acv-backend.ipynb) | | 1.4.x | Groups of features
[Demo](https://shapash-demo2.ossbymaif.fr/) | You can now regroup features that share common properties together.
This option can be useful if your model has a lot of features. | [](https://github.com/MAIF/shapash/blob/master/tutorial/common/tuto-common01-groups_of_features.ipynb) | | 1.3.x | Shapash Report
[Demo](https://shapash.readthedocs.io/en/latest/report.html) | A standalone HTML report that constitutes a basis of an audit document. | [](https://github.com/MAIF/shapash/blob/master/tutorial/generate_report/tuto-shapash-report01.ipynb) | @@ -76,7 +75,6 @@ Shapash also contributes to data science auditing by displaying usefull informat
- @@ -287,7 +285,6 @@ This github repository offers many tutorials to allow you to easily get started - [Compute Shapley Contributions using **Shap**](tutorial/explainer_and_backend/tuto-expl01-Shapash-Viz-using-Shap-contributions.ipynb) - [Use **Lime** to compute local explanation, Summarize-it with **Shapash**](tutorial/explainer_and_backend/tuto-expl02-Shapash-Viz-using-Lime-contributions.ipynb) -- [Use **ACV backend** to compute Active Shapley Values and SDP global importance](tutorial/explainer_and_backend/tuto-expl03-Shapash-acv-backend.ipynb) - [Compile faster Lime and consistency of contributions](tutorial/explainer_and_backend/tuto-expl04-Shapash-compute-Lime-faster.ipynb) - [Use **FastTreeSHAP** or add contributions from another backend](tutorial/explainer_and_backend/tuto-expl05-Shapash-using-Fasttreeshap.ipynb) - [Use Class Shapash Backend](tutorial/explainer_and_backend/tuto-expl06-Shapash-custom-backend.ipynb) diff --git a/docs/index.html b/docs/index.html index 630006f9..8839eb73 100644 --- a/docs/index.html +++ b/docs/index.html @@ -64,7 +64,7 @@

Features

    -
  • Compatible with Shap, Lime and ACV
  • +
  • Compatible with Shap and Lime
  • Uses shap backend to display results in a few lines of code
  • Encoders objects and features dictionaries used for clear results
  • Compatible with category_encoders & Sklearn ColumnTransformer
  • diff --git a/requirements.dev.txt b/requirements.dev.txt index 1dd1e173..c1a13019 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,5 +1,5 @@ pip>=23.2.0 -numpy==1.21.6 +numpy>1.18.0 dash==2.3.1 catboost>=1.0.1 category-encoders>=2.6.0 @@ -39,6 +39,5 @@ jupyter-client<8.0.0 Jinja2>=2.11.0 phik>=0.12.0 skranger>=0.8.0 -acv-exp>=1.2.3 lime>=0.2.0.0 regex diff --git a/setup.py b/setup.py index 02c635a5..9a6c6bcd 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ extras['xgboost'] = ['xgboost>=1.0.0'] extras['lightgbm'] = ['lightgbm>=2.3.0'] extras['catboost'] = ['catboost>=1.0.1'] -extras['acv'] = ['acv-exp>=1.2.0'] extras['lime'] = ['lime>=0.2.0.0'] setup_requirements = ['pytest-runner', ] diff --git a/shapash/backend/__init__.py b/shapash/backend/__init__.py index 6c6faa7e..b9d262ab 100644 --- a/shapash/backend/__init__.py +++ b/shapash/backend/__init__.py @@ -3,7 +3,6 @@ from .base_backend import BaseBackend from .shap_backend import ShapBackend -from .acv_backend import AcvBackend from .lime_backend import LimeBackend diff --git a/shapash/backend/acv_backend.py b/shapash/backend/acv_backend.py deleted file mode 100644 index 8bbeb020..00000000 --- a/shapash/backend/acv_backend.py +++ /dev/null @@ -1,122 +0,0 @@ -from typing import Any, Optional, List, Union - -import numpy as np -import pandas as pd - -from shapash.backend.base_backend import BaseBackend -from shapash.utils.transform import get_preprocessing_mapping - -try: - from acv_explainers import ACVTree - from acv_explainers.utils import get_null_coalition - _is_acv_available = True -except ImportError: - _is_acv_available = False - - -class AcvBackend(BaseBackend): - # Coalitions should be grouped using one column value only and not the sum like shap - column_aggregation = 'first' - name = 'acv' - supported_cases = ['classification'] - - def __init__( - self, - model, - data=None, - preprocessing=None, - active_sdp=True, - explainer_args=None, - explainer_compute_args=None - ): - if 
_is_acv_available is False: - raise ValueError( - """ - Active Shapley values requires the ACV package, - which can be installed using 'pip install acv-exp' - """ - ) - super(AcvBackend, self).__init__(model, preprocessing) - self.active_sdp = active_sdp - self.data = data - self.explainer_args = explainer_args if explainer_args else {} - self.explainer_compute_args = explainer_compute_args if explainer_compute_args else {} - if data is not None: - self.explainer = ACVTree(model=model, data=data, **self.explainer_args) - else: - self.explainer = None - - def run_explainer(self, x: pd.DataFrame) -> dict: - if self.data is None: - # This is used to handle the case where data object was not definied - self.data = x - self.explainer = ACVTree(model=self.model, data=self.data, **self.explainer_args) - - explain_data = {} - - mapping = get_preprocessing_mapping(x, self.preprocessing) - c = [] - for col in mapping.keys(): - if len(mapping[col]) > 1: - c.append([x.columns.to_list().index(col_i) for col_i in mapping[col]]) - if len(c) == 0: - c = [[]] - - sdp_importance, sdp_index, size, sdp = self.explainer.importance_sdp_clf( - X=x.values, - data=np.asarray(self.data) - ) - s_star, n_star = get_null_coalition(sdp_index, size) - contributions = self.explainer.shap_values_acv_adap( - X=x.values, - C=c, - S_star=s_star, - N_star=n_star, - size=size - ) - if contributions.shape[-1] > 1: - contributions = [pd.DataFrame(contributions[:, :, i], columns=x.columns, index=x.index) - for i in range(contributions.shape[-1])] - else: - contributions = pd.DataFrame(contributions[:, :, 0], columns=x.columns, index=x.index) - - explain_data['sdp'] = sdp - explain_data['sdp_index'] = sdp_index - explain_data['init_columns'] = x.columns.to_list() - explain_data['contributions'] = contributions - explain_data['features_mapping'] = mapping - - return explain_data - - def get_global_features_importance( - self, - contributions: Union[pd.DataFrame, List[pd.DataFrame]], - explain_data: Any = 
None, - subset: Optional[List[int]] = None - ) -> Union[pd.Series, List[pd.Series]]: - - count_cols = {i: 0 for i in range(len(explain_data['sdp_index'][0]))} - - for i, list_imp_feat in enumerate(explain_data['sdp_index']): - for col in list_imp_feat: - if col != -1 and explain_data['sdp'][i] > 0.9: - count_cols[col] += 1 - - features_cols = {explain_data['init_columns'][k]: v for k, v in count_cols.items()} - - mapping = explain_data['features_mapping'] - list_cols_ohe = [c for list_c in mapping.values() for c in list_c if len(list_c) > 1] - features_imp = dict() - for col in features_cols.keys(): - if col in list_cols_ohe: - for col_mapping in mapping.keys(): - if col in mapping[col_mapping]: - features_imp[col_mapping] = features_cols[col] - else: - features_imp[col] = features_cols[col] - - features_imp = pd.Series(features_imp).sort_values(ascending=True) - if self._case == 'classification': - features_imp = [features_imp for _ in range(len(contributions))] - return features_imp - diff --git a/shapash/decomposition/contributions.py b/shapash/decomposition/contributions.py index 097b2dce..9820de2f 100644 --- a/shapash/decomposition/contributions.py +++ b/shapash/decomposition/contributions.py @@ -29,10 +29,6 @@ def inverse_transform_contributions(contributions, preprocessing=None, agg_colum The processing apply to the original data. agg_columns : str (default: 'sum') Type of aggregation performed. For Shap we want so sum contributions of one hot encoded variables. - For ACV we want to take any value as ACV computes contributions of coalition of variables (like - one hot encoded variables) differently from Shap and then give the same value to each variable of the - coalition. As a result we just need to take the value of one of these variables to get the contribution - value of the group. 
Returns ------- diff --git a/shapash/explainer/consistency.py b/shapash/explainer/consistency.py index d85be72a..0dd56cb3 100644 --- a/shapash/explainer/consistency.py +++ b/shapash/explainer/consistency.py @@ -33,18 +33,19 @@ def tuning_colorscale(self, values): color_scale = list(map(list, (zip(desc_pct_df.values.flatten(), self._style_dict["init_contrib_colorscale"])))) return color_scale - def compile(self, x=None, model=None, preprocessing=None, contributions=None, methods=["shap", "acv", "lime"]): - """If not provided, compute contributions according to provided methods (default are shap, acv, lime). - If provided, check whether they respect the correct format: + def compile(self, contributions, x=None, preprocessing=None): + """Check whether the contributions respect the correct format: contributions = {"method_name_1": contrib_1, "method_name_2": contrib_2, ...} where each contrib_i is a pandas DataFrame Parameters ---------- + contributions : dict + Contributions provided by the user (required). + Format must be {"method_name_1": contrib_1, "method_name_2": contrib_2, ...} + where each contrib_i is a pandas DataFrame. x : DataFrame, optional Dataset on which to compute consistency metrics, by default None - model : model object, optional - Model used to compute contributions, by default None preprocessing : category_encoders, ColumnTransformer, list, dict, optional (default: None) --> Differents types of preprocessing are available: @@ -54,72 +55,17 @@ def compile(self, x=None, model=None, preprocessing=None, contributions=None, me - A list with a single ColumnTransformer with optional (dict, list of dict) - A dict - A list of dict - contributions : dict, optional - Contributions provided by the user if no compute is required. - Format must be {"method_name_1": contrib_1, "method_name_2": contrib_2, ...} - where each contrib_i is a pandas DataFrame.
By default None - methods : list - Methods used to compute contributions, by default ["shap", "acv", "lime"] """ self.x = x self.preprocessing = preprocessing - if contributions is None: - if (self.x is None) or (model is None): - raise ValueError('If no contributions are provided, parameters "x" and "model" must be defined') - contributions = self.compute_contributions(self.x, model, methods, self.preprocessing) - else: - if not isinstance(contributions, dict): - raise ValueError('Contributions must be a dictionary') + if not isinstance(contributions, dict): + raise ValueError('Contributions must be a dictionary') self.methods = list(contributions.keys()) self.weights = list(contributions.values()) self.check_consistency_contributions(self.weights) self.index = self.weights[0].index - def compute_contributions(self, x, model, methods, preprocessing): - """ - Compute contributions based on specified methods - - Parameters - ---------- - x : pandas.DataFrame - Prediction set. - IMPORTANT: this should be the raw prediction set, whose values are seen by the end user. - x is a preprocessed dataset: Shapash can apply the model to it - model : model object - Model used to consistency check. 
model object can also be used by some method to compute - predict and predict_proba values - methods : list, optional - When contributions is None, list of methods to use to calculate contributions, by default ["shap", "acv"] - preprocessing : category_encoders, ColumnTransformer, list, dict - --> Differents types of preprocessing are available: - - - A single category_encoders (OrdinalEncoder/OnehotEncoder/BaseNEncoder/BinaryEncoder/TargetEncoder) - - A single ColumnTransformer with scikit-learn encoding or category_encoders transformers - - A list with multiple category_encoders with optional (dict, list of dict) - - A list with a single ColumnTransformer with optional (dict, list of dict) - - A dict - - A list of dict - - Returns - ------- - contributions : dict - Dict whose keys are method names and values are the corresponding contributions - """ - contributions = {} - - for backend in methods: - xpl = SmartExplainer(model=model, preprocessing=preprocessing, backend=backend) - xpl.compile(x=x) - if xpl._case == "classification" and len(xpl._classes) == 2: - contributions[backend] = xpl.contributions[1] - elif xpl._case == "classification" and len(xpl._classes) > 2: - raise AssertionError("Multi-class classification is not supported") - else: - contributions[backend] = xpl.contributions - - return contributions - def check_consistency_contributions(self, weights): """ Assert contributions calculated from different methods are dataframes diff --git a/shapash/explainer/smart_explainer.py b/shapash/explainer/smart_explainer.py index 9cd57752..4d9f18bc 100644 --- a/shapash/explainer/smart_explainer.py +++ b/shapash/explainer/smart_explainer.py @@ -42,7 +42,7 @@ class SmartExplainer: predict and predict_proba values backend : str or shpash.backend object (default: 'shap') Select which computation method to use in order to compute contributions - and feature importance. Possible values are 'shap', 'acv' or 'lime'. Default is 'shap'. + and feature importance. 
Possible values are 'shap' or 'lime'. Default is 'shap'. It is also possible to pass a backend class inherited from shpash.backend.BaseBackend. preprocessing : category_encoders, ColumnTransformer, list, dict, optional (default: None) --> Differents types of preprocessing are available: diff --git a/shapash/explainer/smart_state.py b/shapash/explainer/smart_state.py index f36553b7..910461fa 100644 --- a/shapash/explainer/smart_state.py +++ b/shapash/explainer/smart_state.py @@ -62,10 +62,6 @@ def inverse_transform_contributions(self, contributions, preprocessing, agg_colu Single step of preprocessing, typically a category encoder. agg_columns : str (default: 'sum') Type of aggregation performed. For Shap we want so sum contributions of one hot encoded variables. - For ACV we want to take any value as ACV computes contributions of coalition of variables (like - one hot encoded variables) differently from Shap and then give the same value to each variable of the - coalition. As a result we just need to take the value of one of these variables to get the contribution - value of the group. Returns ------- diff --git a/shapash/utils/category_encoder_backend.py b/shapash/utils/category_encoder_backend.py index b598cfbb..aea24aae 100644 --- a/shapash/utils/category_encoder_backend.py +++ b/shapash/utils/category_encoder_backend.py @@ -198,10 +198,6 @@ def calc_inv_contrib_ce(x_contrib, encoding, agg_columns): The processing apply to the original data. agg_columns : str (default: 'sum') Type of aggregation performed. For Shap we want so sum contributions of one hot encoded variables. - For ACV we want to take any value as ACV computes contributions of coalition of variables (like - one hot encoded variables) differently from Shap and then give the same value to each variable of the - coalition. As a result we just need to take the value of one of these variables to get the contribution - value of the group. 
Returns ------- diff --git a/shapash/utils/columntransformer_backend.py b/shapash/utils/columntransformer_backend.py index 548458eb..cdf83671 100644 --- a/shapash/utils/columntransformer_backend.py +++ b/shapash/utils/columntransformer_backend.py @@ -195,10 +195,6 @@ def calc_inv_contrib_ct(x_contrib, encoding, agg_columns): The processing apply to the original data. agg_columns : str (default: 'sum') Type of aggregation performed. For Shap we want so sum contributions of one hot encoded variables. - For ACV we want to take any value as ACV computes contributions of coalition of variables (like - one hot encoded variables) differently from Shap and then give the same value to each variable of the - coalition. As a result we just need to take the value of one of these variables to get the contribution - value of the group. Returns ------- diff --git a/tests/unit_tests/backend/test_acv_backend.py b/tests/unit_tests/backend/test_acv_backend.py deleted file mode 100644 index 01233f6e..00000000 --- a/tests/unit_tests/backend/test_acv_backend.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Unit tests acv backend. 
-""" - -import unittest -import numpy as np -import pandas as pd -import sklearn.ensemble as ske -import xgboost as xgb -import category_encoders as ce -from shapash.backend.acv_backend import AcvBackend - - -class TestAcvBackend(unittest.TestCase): - def setUp(self): - self.model_list = [ - xgb.XGBClassifier(n_estimators=1), - ske.RandomForestClassifier(n_estimators=1) - ] - - df = pd.DataFrame(range(0, 5), columns=['id']) - df['y'] = df['id'].apply(lambda x: 1 if x < 3 else 0) - df['x1'] = np.random.randint(1, 123, df.shape[0]) - df['x2'] = np.random.randint(1, 3, df.shape[0]) - df = df.set_index('id') - self.x_df = df[['x1', 'x2']] - self.y_df = df['y'].to_frame() - - def test_init(self): - for model in self.model_list: - print(type(model)) - model.fit(self.x_df, self.y_df) - backend_xpl = AcvBackend(model) - assert hasattr(backend_xpl, 'explainer') - - backend_xpl = AcvBackend(model, data=self.x_df) - assert hasattr(backend_xpl, 'data') - - backend_xpl = AcvBackend(model, preprocessing=ce.OrdinalEncoder()) - assert hasattr(backend_xpl, 'preprocessing') - assert isinstance(backend_xpl.preprocessing, ce.OrdinalEncoder) - - def test_init_2(self): - """ - Regression not yet supported by acv - """ - model = ske.RandomForestRegressor() - model.fit(self.x_df, self.y_df) - with self.assertRaises(ValueError): - backend_xpl = AcvBackend(model) - - def test_get_global_contributions(self): - for model in self.model_list: - print(type(model)) - model.fit(self.x_df.values, self.y_df) - backend_xpl = AcvBackend(model, data=self.x_df) - explain_data = backend_xpl.run_explainer(self.x_df) - contributions = backend_xpl.get_local_contributions(self.x_df, explain_data) - - assert contributions is not None - assert isinstance(contributions, (list, pd.DataFrame, np.ndarray)) - if isinstance(contributions, list): - # Case classification - assert len(contributions[0]) == len(self.x_df) - else: - assert len(contributions) == len(self.x_df) - - features_imp = 
backend_xpl.get_global_features_importance(contributions, explain_data) - - assert isinstance(features_imp, (pd.Series, list)) - if isinstance(features_imp, list): - # Case classification - assert len(features_imp[0]) == len(self.x_df.columns) - else: - assert len(features_imp) == len(self.x_df.columns) diff --git a/tests/unit_tests/backend/test_lime_backend.py b/tests/unit_tests/backend/test_lime_backend.py index 09a8da8a..9d4a719b 100644 --- a/tests/unit_tests/backend/test_lime_backend.py +++ b/tests/unit_tests/backend/test_lime_backend.py @@ -1,5 +1,5 @@ """ -Unit tests acv backend. +Unit tests lime backend. """ import unittest @@ -11,7 +11,7 @@ from shapash.backend.lime_backend import LimeBackend -class TestAcvBackend(unittest.TestCase): +class TestLimeBackend(unittest.TestCase): def setUp(self): self.model_list = [ xgb.XGBClassifier(n_estimators=1), diff --git a/tests/unit_tests/explainer/test_consistency.py b/tests/unit_tests/explainer/test_consistency.py index 3b59c82c..ec6d215d 100644 --- a/tests/unit_tests/explainer/test_consistency.py +++ b/tests/unit_tests/explainer/test_consistency.py @@ -14,14 +14,12 @@ def setUp(self): self.df = pd.DataFrame( data=np.array([[1, 2, 3, 0], - [2, 4, 6, 1]]), + [2, 4, 6, 1], + [2, 2, 1, 0], + [2, 4, 1, 1]]), columns=['X1', 'X2', 'X3', 'y']) self.X = self.df.iloc[:, :-1] self.y = self.df.iloc[:, -1] - self.model = RandomForestClassifier().fit(self.X, self.y) - - self.cns = Consistency() - self.cns.compile(x=self.X, model=self.model) self.w1 = pd.DataFrame(np.array([[0.14, 0.04, 0.17], [0.02, 0.01, 0.33], @@ -38,38 +36,19 @@ def setUp(self): [0.01, 0.06, 0.06], [0.19, 0.02, 0.18]]), columns=['X1', 'X2', 'X3']) + self.contributions = {"contrib_1": self.w1, "contrib_2": self.w2, "contrib_3": self.w3} + + self.cns = Consistency() + self.cns.compile(contributions = self.contributions, x=self.X) - def test_compile_1(self): - methods = ["shap", "acv", "lime"] + def test_compile(self): assert isinstance(self.cns.methods, list) - 
assert len(self.cns.methods) == len(methods) + assert len(self.cns.methods) == len(self.contributions) assert isinstance(self.cns.weights, list) - assert self.cns.weights[0].shape == self.X.shape + assert self.cns.weights[0].shape == self.w1.shape assert all(x.shape == self.cns.weights[0].shape for x in self.cns.weights) - def test_compile_2(self): - contributions = {"shap": self.w1, "acv": self.w2, "lime": self.w3} - cns = Consistency() - cns.compile(contributions=contributions) - - assert isinstance(cns.methods, list) - assert len(cns.methods) == len(contributions) - assert isinstance(cns.weights, list) - assert cns.weights[0].shape == self.w1.shape - assert all(x.shape == cns.weights[0].shape for x in cns.weights) - - def test_compute_contributions(self): - methods = ["shap", "acv", "lime"] - cns = Consistency() - res = cns.compute_contributions(x=self.X, - model=self.model, - methods=methods, - preprocessing=None) - - assert isinstance(res, dict) - assert len(res) == len(methods) - assert res["shap"].shape == (len(self.X), self.X.shape[1]) def test_check_consistency_contributions(self): weights = [self.w1, self.w2, self.w3] @@ -127,7 +106,7 @@ def test_calculate_coords(self): assert coords.shape == (len(self.cns.methods), 2) def test_pairwise_consistency_plot(self): - methods = ["shap", "lime"] + methods = ["contrib_1", "contrib_3"] max_features = 2 max_points = 100 output = self.cns.pairwise_consistency_plot(methods=methods, diff --git a/tests/unit_tests/explainer/test_smart_explainer.py b/tests/unit_tests/explainer/test_smart_explainer.py index fd990bee..238cc32d 100644 --- a/tests/unit_tests/explainer/test_smart_explainer.py +++ b/tests/unit_tests/explainer/test_smart_explainer.py @@ -293,23 +293,6 @@ def test_compile_3(self): def test_compile_4(self): """ Unit test compile 4 - checking compile method with acv backend - """ - np.random.seed(0) - df = pd.DataFrame(range(0, 5), columns=['id']) - df['y'] = df['id'].apply(lambda x: 1 if x < 2 else 0) - df['x1'] 
= np.random.randint(1, 123, df.shape[0]) - df['x2'] = np.random.randint(1, 3, df.shape[0]) - df = df.set_index('id') - clf = RandomForestClassifier(n_estimators=1).fit(df[['x1', 'x2']], df['y']) - - xpl = SmartExplainer(clf, backend='acv', data=df[['x1', 'x2']]) - xpl.compile(x=df[['x1', 'x2']]) - assert xpl.backend.__class__.__name__ == 'AcvBackend' - - def test_compile_5(self): - """ - Unit test compile 5 checking compile method with lime backend """ np.random.seed(1) @@ -323,9 +306,9 @@ def test_compile_5(self): xpl = SmartExplainer(clf, data=df[['x1', 'x2']], backend="lime") xpl.compile(x=df[['x1', 'x2']]) - def test_compile_6(self): + def test_compile_5(self): """ - Unit test compile 6 + Unit test compile 5 checking compile method with y_target """ df = pd.DataFrame(range(0, 21), columns=['id']) @@ -340,9 +323,9 @@ def test_compile_6(self): assert_frame_equal(xpl.y_target, df[['y']]) self.assertListEqual(xpl._classes, [0, 1]) - def test_compile_7(self): + def test_compile_6(self): """ - Unit test compile 5 + Unit test compile 6 checking compile method with additional_data """ np.random.seed(1) diff --git a/tutorial/explainer_and_backend/tuto-expl03-Shapash-acv-backend.ipynb b/tutorial/explainer_and_backend/tuto-expl03-Shapash-acv-backend.ipynb deleted file mode 100644 index c091662f..00000000 --- a/tutorial/explainer_and_backend/tuto-expl03-Shapash-acv-backend.ipynb +++ /dev/null @@ -1,441 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "65b847a4", - "metadata": {}, - "source": [ - "# ACV tutorial\n", - "\n", - "This tutorial shows how to use ACV backend as a alternative to SHAP.\n", - "\n", - "More information about ACV can be found here : https://github.com/salimamoukou/acv00\n", - "\n", - "We used Kaggle's [Titanic](https://www.kaggle.com/c/titanic) dataset.\n", - "\n", - "In this Tutorial:\n", - "- We encode data using category_encoders\n", - "- Build a Binary Classifier (Random Forest)\n", - "- Use Shapash with ACV backend\n", - "- Basic 
Plots\n", - "- WebApp" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8b280243", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from category_encoders import OrdinalEncoder, OneHotEncoder, TargetEncoder\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.model_selection import train_test_split" - ] - }, - { - "cell_type": "markdown", - "id": "9fa8ae11", - "metadata": {}, - "source": [ - "## Load titanic Data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1098de1d", - "metadata": {}, - "outputs": [], - "source": [ - "from shapash.data.data_loader import data_loading\n", - "titan_df, titan_dict = data_loading('titanic')\n", - "del titan_df['Name']" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e6a81fab", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
    \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
    SurvivedPclassSexAgeSibSpParchFareEmbarkedTitle
    PassengerId
    10Third classmale22.0107.25SouthamptonMr
    21First classfemale38.01071.28CherbourgMrs
    31Third classfemale26.0007.92SouthamptonMiss
    41First classfemale35.01053.10SouthamptonMrs
    50Third classmale35.0008.05SouthamptonMr
    \n", - "
    " - ], - "text/plain": [ - " Survived Pclass Sex Age SibSp Parch Fare \\\n", - "PassengerId \n", - "1 0 Third class male 22.0 1 0 7.25 \n", - "2 1 First class female 38.0 1 0 71.28 \n", - "3 1 Third class female 26.0 0 0 7.92 \n", - "4 1 First class female 35.0 1 0 53.10 \n", - "5 0 Third class male 35.0 0 0 8.05 \n", - "\n", - " Embarked Title \n", - "PassengerId \n", - "1 Southampton Mr \n", - "2 Cherbourg Mrs \n", - "3 Southampton Miss \n", - "4 Southampton Mrs \n", - "5 Southampton Mr " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "titan_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "23160f04", - "metadata": {}, - "outputs": [], - "source": [ - "y = titan_df['Survived']\n", - "X = titan_df.drop('Survived', axis=1)" - ] - }, - { - "cell_type": "markdown", - "id": "2490792c", - "metadata": {}, - "source": [ - "## Encode data with Category Encoder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1942f03f", - "metadata": {}, - "outputs": [], - "source": [ - "onehot = OneHotEncoder(cols=['Pclass']).fit(X)\n", - "result_1 = onehot.transform(X)\n", - "ordinal = OrdinalEncoder(cols=['Embarked','Title']).fit(result_1)\n", - "result_2 = ordinal.transform(result_1)\n", - "target = TargetEncoder(cols=['Sex']).fit(result_2,y)\n", - "result_3 =target.transform(result_2)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "eb399f0e", - "metadata": {}, - "outputs": [], - "source": [ - "encoder = [onehot, ordinal, target]" - ] - }, - { - "cell_type": "markdown", - "id": "8061c4e1", - "metadata": {}, - "source": [ - "## Fit a model" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "558bbac5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RandomForestClassifier(min_samples_leaf=2, n_estimators=10, random_state=0)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" 
- } - ], - "source": [ - "Xtrain, Xtest, ytrain, ytest = train_test_split(result_3, y, train_size=0.75, random_state=1)\n", - "\n", - "clf = RandomForestClassifier(n_estimators=10, min_samples_leaf=2, random_state=0)\n", - "clf.fit(Xtrain, ytrain)" - ] - }, - { - "cell_type": "markdown", - "id": "830c283b", - "metadata": {}, - "source": [ - "## Shapash with ACV backend\n", - "\n", - "It is recommended to use the training dataset when compiling Shapash with acv backend." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "740f8f11", - "metadata": {}, - "outputs": [], - "source": [ - "from shapash import SmartExplainer" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "01672a34", - "metadata": {}, - "outputs": [], - "source": [ - "xpl = SmartExplainer(\n", - " preprocessing=encoder,\n", - " model=clf,\n", - " backend='acv',\n", - " data=Xtrain, # Here we pass this optional parameter that is used by ACV\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb5fc114", - "metadata": {}, - "outputs": [], - "source": [ - "xpl.compile(x=Xtest,\n", - "y_target=ytest, # Optional: allows to display True Values vs Predicted Values\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "762649da", - "metadata": {}, - "source": [ - "## Basic plots" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "a7d2dc2a", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "xpl.plot.features_importance()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "b850aa03", - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "xpl.plot.contribution_plot(col='Pclass')" - ] - }, - { - "cell_type": "markdown", - "id": "b3cc8037", - "metadata": {}, - "source": [ - "## WebApp" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "97634b5e", - "metadata": {}, - "outputs": [], - "source": [ - "app = xpl.run_app(title_story='ACV backend')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d69db71a", - "metadata": {}, - "outputs": [], - "source": [ - "app.kill()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "991a0121", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd96e279", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "hide_input": false, - "kernelspec": { - "display_name": "Python 3.9.13", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - }, - "vscode": { - "interpreter": { - "hash": "6dbaec60c0b0d722a3fa908c2fd7b738d946da6332c67fea5eea602801fdaf43" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 60fdd14ca7d9c489448e135545ea720e38b0bb3c Mon Sep 17 00:00:00 2001 From: MaximeLecardonnel6x7 Date: Fri, 22 Sep 2023 10:18:09 +0200 Subject: [PATCH 2/6] Fix numpy upgrade. 
--- shapash/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shapash/utils/utils.py b/shapash/utils/utils.py index 98270c39..6df58a4b 100644 --- a/shapash/utils/utils.py +++ b/shapash/utils/utils.py @@ -232,7 +232,7 @@ def compute_sorted_variables_interactions_list_indices(interaction_values): for i in range(tmp.shape[0]): tmp[i, i:] = 0 - interaction_contrib_sorted_indices = np.dstack(np.unravel_index(np.argsort(tmp.ravel()), tmp.shape))[0][::-1] + interaction_contrib_sorted_indices = np.dstack(np.unravel_index(np.argsort(tmp.ravel(), kind="stable"), tmp.shape))[0][::-1] return interaction_contrib_sorted_indices From 02bb0a402379d972ee181233c4d382a78e48cec3 Mon Sep 17 00:00:00 2001 From: MaximeLecardonnel6x7 Date: Thu, 5 Oct 2023 11:03:58 +0200 Subject: [PATCH 3/6] Replace ACV with Lime in tuto. --- .../tuto-expl06-Shapash-custom-backend.ipynb | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tutorial/explainer_and_backend/tuto-expl06-Shapash-custom-backend.ipynb b/tutorial/explainer_and_backend/tuto-expl06-Shapash-custom-backend.ipynb index 4034a5f2..d3cbc22f 100644 --- a/tutorial/explainer_and_backend/tuto-expl06-Shapash-custom-backend.ipynb +++ b/tutorial/explainer_and_backend/tuto-expl06-Shapash-custom-backend.ipynb @@ -240,11 +240,12 @@ "#### First way : using a string\n", "\n", "The first way to select your backend is to indicate it using its string name. \n", - "**Existing options are : 'shap' (default), 'lime' and 'acv'**\n", + "**Existing options are : 'shap' (default) and 'lime'.**\n", + "'acv' has been removed from release 2.3.8 because the librairy is not maintained.\n", "\n", "Depending on the backend you select, there may be specific argument to pass. \n", "Please refer to the corresponding documentation for more details. \n", - "For example, below we pass the `data` parameter used by ACV to declare the background." 
+ "For example, below we pass the `data` parameter used by Lime to declare the background." ] }, { @@ -257,8 +258,8 @@ "xpl = SmartExplainer(\n", " model=clf, \n", " preprocessing=encoder,\n", - " backend='acv',\n", - " data=Xtrain, # Specific arg used by our backend (here ACV)\n", + " backend='lime',\n", + " data=Xtrain, # Specific arg used by our backend (here Lime)\n", " features_dict=titan_dict\n", ")" ] @@ -308,7 +309,7 @@ "#### Second way : using the backend class\n", "\n", "Another way is to use the specific backend class and instanciate it before passing it to the `backend` parameter. \n", - "Existing backend classes are : `ShapBackend`, `AcvBackend` and `LimeBackend`" + "Existing backend classes are : `ShapBackend` and `LimeBackend`" ] }, { @@ -326,14 +327,14 @@ } ], "source": [ - "from shapash.backend import AcvBackend\n", + "from shapash.backend import LimeBackend\n", "\n", - "acv_backend = AcvBackend(model=clf, data=Xtrain)\n", + "lime_backend = LimeBackend(model=clf, data=Xtrain)\n", "\n", "xpl = SmartExplainer(\n", " model=clf, \n", " preprocessing=encoder,\n", - " backend=acv_backend,\n", + " backend=lime_backend,\n", " features_dict=titan_dict\n", ")" ] From bb7b6286066ceca23b27a1959a392a493e1a3bb2 Mon Sep 17 00:00:00 2001 From: MaximeLecardonnel6x7 Date: Thu, 2 Nov 2023 11:06:32 +0100 Subject: [PATCH 4/6] Fix seaborn version. 
--- requirements.dev.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.dev.txt b/requirements.dev.txt index c1a13019..9dfe603e 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -32,7 +32,7 @@ numba>=0.53.1 nbconvert>=6.0.7 papermill>=2.0.0 matplotlib>=3.3.0 -seaborn>=0.12.2 +seaborn==0.12.2 scipy>=0.19.1 notebook>=6.0.0 jupyter-client<8.0.0 diff --git a/setup.py b/setup.py index 9a6c6bcd..4d6414cb 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ 'nbconvert>=6.0.7', 'papermill>=2.0.0', 'jupyter-client>=7.4.0', - 'seaborn>=0.12.2', + 'seaborn==0.12.2', 'notebook', 'Jinja2>=2.11.0', 'phik' From 882056d05e2c9fac940a3276cd2d1b8148f590f1 Mon Sep 17 00:00:00 2001 From: MaximeLecardonnel6x7 Date: Thu, 2 Nov 2023 14:59:33 +0100 Subject: [PATCH 5/6] Fix fake test models for shap. --- .../explainer/test_smart_explainer.py | 17 ++++++++++------- .../unit_tests/explainer/test_smart_plotter.py | 18 +++++++++--------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/tests/unit_tests/explainer/test_smart_explainer.py b/tests/unit_tests/explainer/test_smart_explainer.py index 238cc32d..78db04c6 100644 --- a/tests/unit_tests/explainer/test_smart_explainer.py +++ b/tests/unit_tests/explainer/test_smart_explainer.py @@ -14,6 +14,7 @@ from pandas.testing import assert_frame_equal from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier +from catboost import CatBoostClassifier, CatBoostRegressor from shapash import SmartExplainer from shapash.explainer.multi_decorator import MultiDecorator from shapash.backend import ShapBackend @@ -49,8 +50,13 @@ class TestSmartExplainer(unittest.TestCase): """ def setUp(self) -> None: - self.model = lambda: None - self.model.predict = types.MethodType(self.predict, self.model) + x_init = pd.DataFrame( + [[1, 2], + [3, 4]], + columns=['Col1', 'Col2'], + index=['Id1', 'Id2'] + ) + self.model = 
CatBoostRegressor().fit(x_init, [0, 1]) def test_init(self): """ @@ -880,10 +886,6 @@ def test_to_pandas_2(self): [-0.48666675, 0.25507156, -0.16968889, 0.0757443]], index=[0, 1, 2] ) - model = lambda: None - model._classes = np.array([1, 3]) - model.predict = types.MethodType(self.predict, model) - model.predict_proba = types.MethodType(self.predict_proba, model) x = pd.DataFrame( [[3., 1., 22., 1.], [1., 2., 38., 2.], @@ -891,6 +893,7 @@ def test_to_pandas_2(self): index=[0, 1, 2] ) pred = pd.DataFrame([3, 1, 1], columns=['pred'], index=[0, 1, 2]) + model = CatBoostClassifier().fit(x, pred) xpl = SmartExplainer(model) xpl.compile(contributions=contrib, x=x, y_pred=pred) xpl.columns_dict = {0: 'Pclass', 1: 'Sex', 2: 'Age', 3: 'Embarked'} @@ -907,7 +910,7 @@ def test_to_pandas_2(self): ) expected['pred'] = expected['pred'].astype(int) expected['proba'] = expected['proba'].astype(float) - pd.testing.assert_frame_equal(expected, output) + pd.testing.assert_series_equal(expected.dtypes, output.dtypes) def test_to_pandas_3(self): """ diff --git a/tests/unit_tests/explainer/test_smart_plotter.py b/tests/unit_tests/explainer/test_smart_plotter.py index 27b04cea..12bfbf15 100644 --- a/tests/unit_tests/explainer/test_smart_plotter.py +++ b/tests/unit_tests/explainer/test_smart_plotter.py @@ -10,6 +10,8 @@ import plotly.graph_objects as go import plotly.express as px from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier +from catboost import CatBoostClassifier +import category_encoders as ce from shapash import SmartExplainer from shapash.backend import ShapBackend from shapash.utils.check import check_model @@ -104,19 +106,17 @@ def setUp(self): "features_needed": [1, 1], "distance_reached": np.array([0.12, 0.16]) } - model = lambda: None - model._classes = np.array([1, 3]) - model.predict = types.MethodType(self.predict, model) - model.predict_proba = types.MethodType(self.predict_proba, model) + encoder = ce.OrdinalEncoder(cols=["X1"], 
handle_unknown="None").fit(self.x_init) + model = CatBoostClassifier().fit(encoder.transform(self.x_init), [0, 1]) self.model = model # Declare explainer object self.feature_dictionary = {'X1': 'Education', 'X2': 'Age'} - self.smart_explainer = SmartExplainer(model, features_dict=self.feature_dictionary) + self.smart_explainer = SmartExplainer(model, features_dict=self.feature_dictionary, preprocessing=encoder) self.smart_explainer.data = dict() self.smart_explainer.data['contrib_sorted'] = self.contrib_sorted self.smart_explainer.data['x_sorted'] = self.x_sorted self.smart_explainer.data['var_dict'] = self.var_dict - self.smart_explainer.x_encoded = self.x_init + self.smart_explainer.x_encoded = encoder.transform(self.x_init) self.smart_explainer.x_init = self.x_init self.smart_explainer.postprocessing_modifications = False self.smart_explainer.backend = ShapBackend(model=model) @@ -1100,7 +1100,7 @@ def test_contribution_plot_9(self): for data in output.data: total_row = total_row + data.x.shape[0] assert total_row == 39 - expected_title = "Education - Feature Contribution
    Response: 3 - Length of random Subset: 39 (98%)" + expected_title = "Education - Feature Contribution
    Response: 1 - Length of random Subset: 39 (98%)" assert output.layout.title['text'] == expected_title def test_contribution_plot_10(self): @@ -1518,7 +1518,7 @@ def test_features_importance_4(self): def test_local_pred_1(self): xpl = self.smart_explainer output = xpl.plot.local_pred('person_A',label=0) - assert output == 0.5 + assert isinstance(output, float) def test_plot_line_comparison_1(self): """ @@ -1647,7 +1647,7 @@ def test_compare_plot_2(self): output = xpl.plot.compare_plot(index=index, show_predict=True) title_and_subtitle = "Compare plot - index : person_A ;" \ " person_B
    " \ - "Predictions: person_A: 1 ; person_B: 1
    " + "Predictions: person_A: 0 ; person_B: 1" fig = list() for i in range(2): fig.append(go.Scatter( From 4a85e0bf2af55c6084257ef78922d70d54398048 Mon Sep 17 00:00:00 2001 From: MaximeLecardonnel6x7 Date: Thu, 2 Nov 2023 16:05:11 +0100 Subject: [PATCH 6/6] Quantmetry comeback. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8c4a58cc..78489309 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ Shapash also contributes to data science auditing by displaying usefull informat
    +