diff --git a/CHANGELOG.md b/CHANGELOG.md index aa882862..3e2d6ba0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,11 @@ - Example: 10.2.1.4 is the 5th version that supports khiops 10.2.1. - Internals: Changes in *Internals* sections are unlikely to be of interest for data scientists. +## 10.2.2.4 - 2024-08-05 + +### Added +- (`sklearn`) Sklearn's attributes for supervised estimators. + ## 10.2.2.3 - 2024-08-02 ### Fixed diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index c2358cfd..b694caef 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -68,6 +68,14 @@ Samples # Train the classifier khc.fit(X_train, y_train) + # Show the feature importance info + print(f"Features evaluated: {khc.n_features_evaluated_}") + print(f"Features selected : {khc.n_features_used_}") + print("Top 3 used features") + for i, feature in enumerate(khc.feature_used_names_[:3]): + print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print("---") + # Predict the classes on the test dataset y_test_pred = khc.predict(X_test) print("Predicted classes (first 10):") @@ -273,6 +281,14 @@ Samples khc = KhiopsClassifier(n_trees=0) khc.fit(X, y) + # Show the feature importance info + print(f"Features evaluated: {khc.n_features_evaluated_}") + print(f"Features selected : {khc.n_features_used_}") + print("Top 3 used features") + for i, feature in enumerate(khc.feature_used_names_[:3]): + print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print("---") + # Predict the class on the test dataset y_pred = khc.predict(X) print("Predicted classes (first 10):") @@ -420,6 +436,14 @@ Samples # Train the regressor khr.fit(X_train, y_train) + # Show the feature importance info + print(f"Features evaluated: {khr.n_features_evaluated_}") + print(f"Features selected : {khr.n_features_used_}") + print("Top 3 used features") + for i, feature in enumerate(khr.feature_used_names_[:3]): + print(f"{feature} - 
Importance: {khr.feature_used_importances_[i][2]}") + print("---") + # Predict the values on the test dataset y_test_pred = khr.predict(X_test) print("Predicted values for 'age' (first 10):") @@ -561,6 +585,13 @@ Samples khe = KhiopsEncoder(n_features=10) khe.fit(X, y) + # Show the feature importance info + print(f"Features evaluated: {khe.n_features_evaluated_}") + print("Top 3 evaluated features") + for i, feature in enumerate(khe.feature_evaluated_names_[:3]): + print(f"{feature} - Level: {khe.feature_evaluated_importances_[i][0]}") + print("---") + # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 98c2ce2e..b7fcf0fd 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -54,6 +54,14 @@ "# Train the classifier\n", "khc.fit(X_train, y_train)\n", "\n", + "# Show the feature importance info\n", + "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", + "print(f\"Features selected : {khc.n_features_used_}\")\n", + "print(\"Top 3 used features\")\n", + "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", + " print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n", + "print(\"---\")\n", + "\n", "# Predict the classes on the test dataset\n", "y_test_pred = khc.predict(X_test)\n", "print(\"Predicted classes (first 10):\")\n", @@ -298,6 +306,14 @@ "khc = KhiopsClassifier(n_trees=0)\n", "khc.fit(X, y)\n", "\n", + "# Show the feature importance info\n", + "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", + "print(f\"Features selected : {khc.n_features_used_}\")\n", + "print(\"Top 3 used features\")\n", + "for i, feature in enumerate(khc.feature_used_names_[:3]):\n", + " print(f\"{feature} - Importance: {khc.feature_used_importances_[i][2]}\")\n", + "print(\"---\")\n", + "\n", "# Predict the class on the test dataset\n", "y_pred = 
khc.predict(X)\n", "print(\"Predicted classes (first 10):\")\n", @@ -484,6 +500,14 @@ "# Train the regressor\n", "khr.fit(X_train, y_train)\n", "\n", + "# Show the feature importance info\n", + "print(f\"Features evaluated: {khr.n_features_evaluated_}\")\n", + "print(f\"Features selected : {khr.n_features_used_}\")\n", + "print(\"Top 3 used features\")\n", + "for i, feature in enumerate(khr.feature_used_names_[:3]):\n", + " print(f\"{feature} - Importance: {khr.feature_used_importances_[i][2]}\")\n", + "print(\"---\")\n", + "\n", "# Predict the values on the test dataset\n", "y_test_pred = khr.predict(X_test)\n", "print(\"Predicted values for 'age' (first 10):\")\n", @@ -664,6 +688,13 @@ "khe = KhiopsEncoder(n_features=10)\n", "khe.fit(X, y)\n", "\n", + "# Show the feature importance info\n", + "print(f\"Features evaluated: {khe.n_features_evaluated_}\")\n", + "print(\"Top 3 evaluated features\")\n", + "for i, feature in enumerate(khe.feature_evaluated_names_[:3]):\n", + " print(f\"{feature} - Level: {khe.feature_evaluated_importances_[i][0]}\")\n", + "print(\"---\")\n", + "\n", "# Transform the train dataset\n", "print(\"Encoded feature names:\")\n", "print(khe.feature_names_out_)\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 603d040e..7d87d5d8 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -57,6 +57,14 @@ def khiops_classifier(): # Train the classifier khc.fit(X_train, y_train) + # Show the feature importance info + print(f"Features evaluated: {khc.n_features_evaluated_}") + print(f"Features selected : {khc.n_features_used_}") + print("Top 3 used features") + for i, feature in enumerate(khc.feature_used_names_[:3]): + print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print("---") + # Predict the classes on the test dataset y_test_pred = khc.predict(X_test) print("Predicted classes (first 10):") @@ -273,6 +281,14 @@ def 
khiops_classifier_multitable_snowflake(): khc = KhiopsClassifier(n_trees=0) khc.fit(X, y) + # Show the feature importance info + print(f"Features evaluated: {khc.n_features_evaluated_}") + print(f"Features selected : {khc.n_features_used_}") + print("Top 3 used features") + for i, feature in enumerate(khc.feature_used_names_[:3]): + print(f"{feature} - Importance: {khc.feature_used_importances_[i][2]}") + print("---") + # Predict the class on the test dataset y_pred = khc.predict(X) print("Predicted classes (first 10):") @@ -423,6 +439,14 @@ def khiops_regressor(): # Train the regressor khr.fit(X_train, y_train) + # Show the feature importance info + print(f"Features evaluated: {khr.n_features_evaluated_}") + print(f"Features selected : {khr.n_features_used_}") + print("Top 3 used features") + for i, feature in enumerate(khr.feature_used_names_[:3]): + print(f"{feature} - Importance: {khr.feature_used_importances_[i][2]}") + print("---") + # Predict the values on the test dataset y_test_pred = khr.predict(X_test) print("Predicted values for 'age' (first 10):") @@ -581,6 +605,13 @@ def khiops_encoder_multitable_snowflake(): khe = KhiopsEncoder(n_features=10) khe.fit(X, y) + # Show the feature importance info + print(f"Features evaluated: {khe.n_features_evaluated_}") + print("Top 3 evaluated features") + for i, feature in enumerate(khe.feature_evaluated_names_[:3]): + print(f"{feature} - Level: {khe.feature_evaluated_importances_[i][0]}") + print("---") + # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index e6ea9151..44ccc423 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -1161,7 +1161,7 @@ def predict(self, X): Returns ------- - `numpy.ndarray` + `ndarray ` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. 
@@ -1529,6 +1529,63 @@ def _fit_training_post_process(self, dataset): ) model_main_dictionary.remove_variable(self.model_target_variable_name_) + # Extract, from the preparation reports, the number of evaluated features, + # their names and their levels + univariate_preparation_report = self.model_report_.preparation_report + if self.model_report_.bivariate_preparation_report is not None: + bivariate_preparation_report = ( + self.model_report_.bivariate_preparation_report + ) + pair_feature_evaluated_names_ = ( + bivariate_preparation_report.get_variable_pair_names() + ) + pair_feature_evaluated_levels_ = [ + bivariate_preparation_report.get_variable_pair_statistics(*var).level + for var in bivariate_preparation_report.get_variable_pair_names() + ] + else: + pair_feature_evaluated_names_ = [] + pair_feature_evaluated_levels_ = [] + if "treePreparationReport" in self.model_report_raw_: + tree_preparation_report = self.model_report_raw_["treePreparationReport"][ + "variablesStatistics" + ] + tree_feature_evaluated_names_ = [ + tree_preparation_report[i]["name"] + for i in range(0, len(tree_preparation_report)) + ] + tree_feature_evaluated_levels_ = [ + tree_preparation_report[i]["level"] + for i in range(0, len(tree_preparation_report)) + ] + else: + tree_feature_evaluated_names_ = [] + tree_feature_evaluated_levels_ = [] + feature_evaluated_names_ = ( + univariate_preparation_report.get_variable_names() + + pair_feature_evaluated_names_ + + tree_feature_evaluated_names_ + ) + feature_evaluated_importances_ = np.array( + [ + univariate_preparation_report.get_variable_statistics(var).level + for var in univariate_preparation_report.get_variable_names() + ] + + pair_feature_evaluated_levels_ + + tree_feature_evaluated_levels_ + ) + + # Sort the features by level + combined = list(zip(feature_evaluated_names_, feature_evaluated_importances_)) + combined.sort(key=lambda x: x[1], reverse=True) + + # Set the sklearn attributes + self.feature_evaluated_names_ = np.array( + 
[x[0] for x in combined], dtype=np.dtype("object") + ) + self.feature_evaluated_importances_ = np.array([x[1] for x in combined]) + self.n_features_evaluated_ = len(combined) + def _transform_check_dataset(self, dataset): assert isinstance(dataset, Dataset), "'dataset' is not 'Dataset'" @@ -1670,9 +1727,30 @@ def _transform_prepare_deployment_model_for_predict(self): variable.used = False return model_copy + def get_feature_used_statistics(self, modeling_report): + # Extract, from the modeling report, names, levels, weights and importances + # of the selected features. + if modeling_report.selected_variables is not None: + feature_used_names_ = np.array( + [var.name for var in modeling_report.selected_variables] + ) + feature_used_importances_ = np.array( + [ + [var.level, var.weight, var.importance] + for var in modeling_report.selected_variables + ] + ) + # Return empty arrays if not selected_variables is available + else: + feature_used_names_ = np.array([], dtype=np.dtype("` of shape (n_classes\_,) + The list of classes seen in training. Depending on the training target, the contents are ``int`` or ``str``. + n_features_evaluated_ : int + The number of features evaluated by the classifier. + feature_evaluated_names_ : `ndarray ` of shape (n_features_evaluated\_,) + Names of the features evaluated by the classifier. + feature_evaluated_importances_ : `ndarray ` of shape (n_features_evaluated\_,) + Level of the features evaluated by the classifier. + See below for a definition of the level. + n_features_used_ : int + The number of features used by the classifier. + feature_used_names_ : `ndarray ` of shape (n_features_used\_, ) + Names of the features used by the classifier. + feature_used_importances_ : `ndarray ` of shape (n_features_used\_, 3) + Level, Weight and Importance of the features used by the classifier: + + - Level: A measure of the predictive importance of the feature taken + individually. 
It ranges between 0 (no predictive interest) and 1 (optimal + predictive importance). + + - Weight: A measure of the predictive importance of the feature taken relative + to all features selected by the classifier. It ranges between 0 (little + contribution to the model) and 1 (large contribution to the model). + + - Importance: The geometric mean between the Level and the Weight. + is_fitted_ : bool ``True`` if the estimator is fitted. is_multitable_model_ : bool @@ -1755,6 +1859,7 @@ class KhiopsClassifier(KhiopsPredictor, ClassifierMixin): - `samples_sklearn.khiops_classifier_pickle()` - `samples_sklearn.khiops_classifier_multitable_star_file()` """ + # pylint: enable=line-too-long def __init__( self, @@ -1882,6 +1987,9 @@ def _fit_training_post_process(self, dataset): self.classes_.sort() self.classes_ = column_or_1d(self.classes_) + # Count number of classes + self.n_classes_ = len(self.classes_) + # Warn when there are no informative variables if self.model_report_.preparation_report.informative_variable_number == 0: warnings.warn( @@ -1890,13 +1998,23 @@ def _fit_training_post_process(self, dataset): stacklevel=6, ) - # Set the target class probabilites as used - # (only the predicted classes is obtained without this step prior to Khiops 10) + # Set the target class probabilities as used + # (only the predicted classes are obtained without this step prior to Khiops 10) for variable in self._get_main_dictionary().variables: for key in variable.meta_data.keys: if key.startswith("TargetProb"): variable.used = True + # Extract statistics, about the selected features, from the modeling report + modeling_report = self.model_report_.modeling_report.get_snb_predictor() + if modeling_report.selected_variables is not None: + feature_used_names_, feature_used_importances_ = ( + self.get_feature_used_statistics(modeling_report) + ) + self.feature_used_names_ = feature_used_names_ + self.feature_used_importances_ = feature_used_importances_ + self.n_features_used_ = 
len(self.feature_used_names_) + def predict(self, X): """Predicts the most probable class for the test dataset X @@ -1917,7 +2035,7 @@ def predict(self, X): Returns ------- - `numpy.ndarray` + `ndarray ` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. The `numpy.dtype` of the array is integer if the classifier was learned with an integer ``y``. Otherwise it @@ -2031,11 +2149,14 @@ def _transform_prepare_deployment_model_for_predict_proba(self): variable.used = True else: variable.used = False + return model_copy class KhiopsRegressor(KhiopsPredictor, RegressorMixin): - """Khiops Selective Naive Bayes Regressor + # Disable line too long as this docstring *needs* to have lines longer than 88c + # pylint: disable=line-too-long + r"""Khiops Selective Naive Bayes Regressor This regressor supports automatic feature engineering on multi-table datasets. See :doc:`/multi_table_primer` for more details. @@ -2085,6 +2206,30 @@ class KhiopsRegressor(KhiopsPredictor, RegressorMixin): Attributes ---------- + n_features_evaluated_ : int + The number of features evaluated by the regressor. + feature_evaluated_names_ : `ndarray ` of shape (n_features_evaluated\_,) + Names of the features evaluated by the regressor. + feature_evaluated_importances_ : `ndarray ` of shape (n_features_evaluated\_,) + Level of the features evaluated by the regressor. + See below for a definition of the level. + n_features_used_ : int + The number of features used by the regressor. + feature_used_names_ : `ndarray ` of shape (n_features_used\_, ) + Names of the features used by the regressor. + feature_used_importances_ : `ndarray ` of shape (n_features_used\_, 3) + Level, Weight and Importance of the features used by the regressor: + + - Level: A measure of the predictive importance of the feature taken + individually. It ranges between 0 (no predictive interest) and 1 (optimal + predictive importance). 
+ + - Weight: A measure of the predictive importance of the feature taken relative + to all features selected by the regressor. It ranges between 0 (little + contribution to the model) and 1 (large contribution to the model). + + - Importance: The geometric mean between the Level and the Weight. + is_fitted_ : bool ``True`` if the estimator is fitted. is_multitable_model_ : bool @@ -2105,6 +2250,7 @@ class KhiopsRegressor(KhiopsPredictor, RegressorMixin): See the following functions of the ``samples_sklearn.py`` documentation script: - `samples_sklearn.khiops_regressor()` """ + # pylint: enable=line-too-long def __init__( self, @@ -2195,6 +2341,16 @@ def _fit_training_post_process(self, dataset): for variable_name in variables_to_eliminate: self._get_main_dictionary().remove_variable(variable_name) + # Extract statistics, about the selected features, from the modeling report + modeling_report = self.model_report_.modeling_report.get_snb_predictor() + if modeling_report.selected_variables is not None: + feature_used_names_, feature_used_importances_ = ( + self.get_feature_used_statistics(modeling_report) + ) + self.feature_used_names_ = feature_used_names_ + self.feature_used_importances_ = feature_used_importances_ + self.n_features_used_ = len(self.feature_used_names_) + def _check_target_type(self, dataset): _check_numerical_target_type(dataset) @@ -2221,7 +2377,7 @@ def predict(self, X): Returns ------- - `numpy.ndarray` + `ndarray ` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. The key columns are added for multi-table tasks. 
@@ -2244,7 +2400,9 @@ def predict(self, X): class KhiopsEncoder(KhiopsSupervisedEstimator, TransformerMixin): - """Khiops supervised discretization/grouping encoder + # Disable line too long as this docstring *needs* to have lines longer than 88c + # pylint: disable=line-too-long + r"""Khiops supervised discretization/grouping encoder Parameters ---------- @@ -2313,6 +2471,14 @@ class KhiopsEncoder(KhiopsSupervisedEstimator, TransformerMixin): Attributes ---------- + n_features_evaluated_ : int + The number of features evaluated by the encoder. + feature_evaluated_names_ : `ndarray ` of shape (n_features_evaluated\_,) + Names of the features evaluated by the encoder. + feature_evaluated_importances_ : `ndarray ` of shape (n_features_evaluated\_,) + Level of the features evaluated by the encoder. The Level is a measure of the + predictive importance of the feature taken individually. It ranges between 0 (no + predictive interest) and 1 (optimal predictive importance). is_fitted_ : bool ``True`` if the estimator is fitted. is_multitable_model_ : bool @@ -2539,7 +2705,7 @@ def transform(self, X): Returns ------- - `numpy.ndarray` + `ndarray ` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. diff --git a/tests/test_estimator_attributes.py b/tests/test_estimator_attributes.py new file mode 100644 index 00000000..f98d57d6 --- /dev/null +++ b/tests/test_estimator_attributes.py @@ -0,0 +1,294 @@ +###################################################################################### +# Copyright (c) 2024 Orange. All rights reserved. # +# This software is distributed under the BSD 3-Clause-clear License, the text of # +# which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or # +# see the "LICENSE.md" file for more details. 
# +###################################################################################### +"""Test consistency of the estimator's attributes with the output reports""" +import unittest +import warnings +from os import path + +import numpy as np +import pandas as pd + +from khiops import core as kh +from khiops.sklearn.estimators import KhiopsClassifier, KhiopsEncoder, KhiopsRegressor + +# Disable PEP8 variable names because of scikit-learn X,y conventions +# To capture invalid-names other than X,y run: +# pylint --disable=all --enable=invalid-names estimators.py +# pylint: disable=invalid-name + + +class EstimatorAttributesTests(unittest.TestCase): + """Test consistency of the estimator's attributes with Khiops's output reports + + The following tests allow to verify that: + - The values of each estimator's attributes are consistent with the reports + (PreparationReport.xls and ModelingReport.xls) produced by Khiops post training. + - The attributes are tested for all supervised estimators: KhiopsClassifier, + KhiopsRegressor and KhiopsEncoder. + - Two datasets are used for the tests: + - Adult (mono-table) + - Accidents (multitable). 
+ """ + + def _create_multitable_input(self, size=None): + accidents_dataset_path = path.join(kh.get_samples_dir(), "Accidents") + accidents_df = pd.read_csv( + path.join(accidents_dataset_path, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + users_df = pd.read_csv( + path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + ) + vehicles_df = pd.read_csv( + path.join(accidents_dataset_path, "Vehicles.txt"), + sep="\t", + encoding="latin1", + ) + places_df = pd.read_csv( + path.join(accidents_dataset_path, "Places.txt"), + sep="\t", + encoding="latin1", + low_memory=False, + ) + + if size is None: + size = len(accidents_df) + + X = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_df[:size], "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), + "Users": ( + users_df.drop("Gravity", axis=1), + ["AccidentId", "VehicleId"], + ), + "Places": (places_df, ["AccidentId"]), + }, + "relations": [ + ("Accidents", "Vehicles"), + ("Vehicles", "Users"), + ("Accidents", "Places", True), + ], + } + + y = pd.read_csv( + path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + sep="\t", + encoding="latin1", + )["Gravity"][:size] + + return X, y + + def assert_attribute_values_ok(self, model, X, y): + # Special checks for KhiopsClassifier + if isinstance(model, KhiopsClassifier): + self.assertEqual(model.classes_.tolist(), sorted(y.unique())) + self.assertEqual(model.n_classes_, len(y.unique())) + self.assertEqual(model.n_features_in_, len(X.columns)) + + # Extract the features and their levels from the report + # TODO: Eliminate this as this is the implementation + # Think of a better lighter test: For example verify that the variable are + # in order within the 3 feature lists (simple, pairs and trees). + # Do similarly below with the selected variables. 
+ univariate_preparation_report = model.model_report_.preparation_report + if model.model_report_.bivariate_preparation_report is not None: + bivariate_preparation_report = ( + model.model_report_.bivariate_preparation_report + ) + pair_feature_evaluated_names_ = ( + bivariate_preparation_report.get_variable_pair_names() + ) + pair_feature_evaluated_levels_ = [ + [ + bivariate_preparation_report.get_variable_pair_statistics( + var[0], var[1] + ).level + ] + for var in bivariate_preparation_report.get_variable_pair_names() + ] + else: + pair_feature_evaluated_names_ = [] + pair_feature_evaluated_levels_ = [] + if "treePreparationReport" in model.model_report_raw_: + tree_preparation_report = model.model_report_raw_["treePreparationReport"][ + "variablesStatistics" + ] + tree_feature_evaluated_names_ = [ + tree_preparation_report[i]["name"] + for i in range(0, len(tree_preparation_report)) + ] + tree_feature_evaluated_levels_ = [ + [tree_preparation_report[i]["level"]] + for i in range(0, len(tree_preparation_report)) + ] + else: + tree_feature_evaluated_names_ = [] + tree_feature_evaluated_levels_ = [] + + feature_evaluated_names_report_ = ( + univariate_preparation_report.get_variable_names() + + pair_feature_evaluated_names_ + + tree_feature_evaluated_names_ + ) + feature_evaluated_importances_report = np.array( + [ + [univariate_preparation_report.get_variable_statistics(var).level] + for var in univariate_preparation_report.get_variable_names() + ] + + pair_feature_evaluated_levels_ + + tree_feature_evaluated_levels_ + ) + + # Sort the features by level + combined = list( + zip(feature_evaluated_names_report_, feature_evaluated_importances_report) + ) + combined.sort(key=lambda x: x[1], reverse=True) + feature_names = list(x[0] for x in combined) + feature_levels = list(x[1] for x in combined) + + # Check that the features and their levels were extracted in order + self.assertEqual( + model.n_features_evaluated_, len(feature_evaluated_names_report_) + ) + 
self.assertEqual(model.feature_evaluated_names_.tolist(), list(feature_names)) + self.assertEqual(model.feature_evaluated_importances_.tolist(), feature_levels) + + modeling_report = model.model_report_.modeling_report + # Check the selected variables for the regressor and classifier + if not isinstance(model, KhiopsEncoder): + # Extract the selected variables and their importances from the report + # TODO: See TODO above + feature_used_names = [ + var.name + for var in modeling_report.get_snb_predictor().selected_variables + ] + feature_used_importances_report = [ + [var.level, var.weight, var.importance] + for var in modeling_report.get_snb_predictor().selected_variables + ] + + self.assertEqual(model.feature_used_names_.tolist(), feature_used_names) + self.assertEqual( + model.feature_used_importances_.tolist(), + feature_used_importances_report, + ) + self.assertEqual( + model.n_features_used_, len(feature_used_importances_report) + ) + self.assertTrue(model.is_fitted_) + + def test_classifier_attributes_monotable(self): + """Test consistency of KhiopsClassifier's attributes with the output reports + + - This test verifies that the values of a trained KhiopsClassifier, on a + a monotable dataset (Adult), are consistent with the reports produced by Khiops + post training. + """ + adult_dataset_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_df = pd.read_csv(adult_dataset_path, sep="\t") + X = adult_df.drop("class", axis=1) + y = adult_df["class"] + khc_adult = KhiopsClassifier() + khc_adult.fit(X, y) + + self.assert_attribute_values_ok(khc_adult, X, y) + self.assertFalse(khc_adult.is_multitable_model_) + + def test_classifier_attributes_multitable(self): + """Test consistency of KhiopsClassifier's attributes with the output reports + + - This test verifies that the values of a trained KhiopsClassifier, on a + a multitable dataset (Accidents), are consistent with the reports produced + by Khiops post training. 
+ """ + X, y = self._create_multitable_input() + khc_accidents = KhiopsClassifier(n_trees=0, n_pairs=10) + khc_accidents.fit(X, y) + self.assert_attribute_values_ok(khc_accidents, X["tables"]["Accidents"][0], y) + self.assertTrue(khc_accidents.is_multitable_model_) + + def test_regressor_attributes_monotable(self): + """Test consistency of KhiopsRegressor's attributes with the output reports + + - This test verifies that the values of a trained KhiopsRegressor, on a + a monotable dataset (Adult), are consistent with the reports produced by Khiops + post training. + """ + adult_dataset_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_df = pd.read_csv(adult_dataset_path, sep="\t").sample(750) + X = adult_df.drop("age", axis=1) + y = adult_df["age"] + khr_adult = KhiopsRegressor(n_trees=0, n_pairs=5) + with warnings.catch_warnings(): + warnings.filterwarnings( + action="ignore", + category=UserWarning, + message="Khiops ended correctly but there were minor issues", + ) + khr_adult.fit(X, y) + + self.assert_attribute_values_ok(khr_adult, X, None) + self.assertFalse(khr_adult.is_multitable_model_) + + def test_regressor_attributes_multitable(self): + """Test consistency of KhiopsRegressor's attributes with the output reports + + - This test verifies that the values of a trained KhiopsRegressor, on a + a multitable dataset (Accidents), are consistent with the reports produced + by Khiops post training. 
+ """ + X, _ = self._create_multitable_input(750) + y = X["tables"]["Accidents"][0]["Commune"] + X["tables"]["Accidents"][0].drop("Commune", axis=1, inplace=True) + khr_accidents = KhiopsRegressor(n_trees=0) + with warnings.catch_warnings(): + warnings.filterwarnings( + action="ignore", + category=UserWarning, + message="Khiops ended correctly but there were minor issues", + ) + khr_accidents.fit(X, y) + + self.assert_attribute_values_ok( + khr_accidents, X["tables"]["Accidents"][0], None + ) + self.assertTrue(khr_accidents.is_multitable_model_) + + def test_encoder_attributes_monotable(self): + """Test consistency of KhiopsEncoder's attributes with the output reports + + - This test verifies that the values of a trained KhiopsEncoder, on a + a monotable dataset (Adult), are consistent with the reports produced + by Khiops post training. + """ + adult_dataset_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_df = pd.read_csv(adult_dataset_path, sep="\t") + X = adult_df.drop("class", axis=1) + y = adult_df["class"] + khe_adult = KhiopsEncoder() + khe_adult.fit(X, y) + + self.assert_attribute_values_ok(khe_adult, X, None) + self.assertFalse(khe_adult.is_multitable_model_) + + def test_encoder_attributes_multitable(self): + """Test consistency of KhiopsEncoder's attributes with the output reports + + - This test verifies that the values of a trained KhiopsEncoder, on a + a multitable dataset (Accidents), are consistent with the reports produced + by Khiops post training. + """ + X, y = self._create_multitable_input() + khe_accidents = KhiopsEncoder(n_trees=5) + khe_accidents.fit(X, y) + + self.assert_attribute_values_ok(khe_accidents, X, None) + self.assertTrue(khe_accidents.is_multitable_model_)