
Add default params_to_tune for feature selection transforms #1250

Merged
merged 8 commits on May 3, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -56,6 +56,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
- Add default `params_to_tune` for `TimeSeriesImputerTransform` ([#1232](https://github.com/tinkoff-ai/etna/pull/1232))
- Add default `params_to_tune` for `DifferencingTransform`, `MedianTransform`, `MaxTransform`, `MinTransform`, `QuantileTransform`, `StdTransform`, `MeanTransform`, `MADTransform`, `MinMaxDifferenceTransform`, `SumTransform`, `BoxCoxTransform`, `YeoJohnsonTransform`, `MaxAbsScalerTransform`, `MinMaxScalerTransform`, `RobustScalerTransform` and `StandardScalerTransform` ([#1233](https://github.com/tinkoff-ai/etna/pull/1233))
- Add default `params_to_tune` for `LabelEncoderTransform` ([#1242](https://github.com/tinkoff-ai/etna/pull/1242))
- Add default `params_to_tune` for `TreeFeatureSelectionTransform`, `MRMRFeatureSelectionTransform` and `GaleShapleyFeatureSelectionTransform` ([#1250](https://github.com/tinkoff-ai/etna/pull/1250))
### Fixed
- Fix bug in `GaleShapleyFeatureSelectionTransform` with wrong number of remaining features ([#1110](https://github.com/tinkoff-ai/etna/pull/1110))
- `ProphetModel` fails with additional seasonality set ([#1157](https://github.com/tinkoff-ai/etna/pull/1157))
78 changes: 71 additions & 7 deletions etna/transforms/feature_selection/feature_importance.py
@@ -14,12 +14,19 @@
from sklearn.tree import ExtraTreeRegressor
from typing_extensions import Literal

from etna import SETTINGS
from etna.analysis import RelevanceTable
from etna.analysis.feature_selection.mrmr_selection import AggregationMode
from etna.analysis.feature_selection.mrmr_selection import mrmr
from etna.datasets import TSDataset
from etna.transforms.feature_selection import BaseFeatureSelectionTransform

if SETTINGS.auto_required:
from optuna.distributions import BaseDistribution
from optuna.distributions import CategoricalDistribution
from optuna.distributions import IntUniformDistribution


TreeBasedRegressor = Union[
DecisionTreeRegressor,
ExtraTreeRegressor,
@@ -41,7 +48,7 @@ class TreeFeatureSelectionTransform(BaseFeatureSelectionTransform):

def __init__(
self,
model: Union[Literal["catboost"], Literal["random_forest"], TreeBasedRegressor],
top_k: int,
features_to_use: Union[List[str], Literal["all"]] = "all",
return_features: bool = False,
@@ -52,8 +59,18 @@ def __init__(
Parameters
----------
model:
Model to make selection; it should have a ``feature_importances_`` property
(e.g. all tree-based regressors in sklearn).

If ``catboost.CatBoostRegressor`` is given without the ``cat_features`` parameter,
then ``cat_features`` is set during ``fit`` to the columns of category dtype.

Pre-defined options are also available:

* catboost: ``catboost.CatBoostRegressor(iterations=1000, silent=True)``;

* random_forest: ``sklearn.ensemble.RandomForestRegressor(n_estimators=100, random_state=0)``.

top_k:
number of features to select; if there are not enough features, then all will be selected
features_to_use:
@@ -64,8 +81,16 @@
if not isinstance(top_k, int) or top_k < 0:
raise ValueError("Parameter top_k should be positive integer")
super().__init__(features_to_use=features_to_use, return_features=return_features)
self.top_k = top_k
if isinstance(model, str):
if model == "catboost":
self.model = CatBoostRegressor(iterations=1000, silent=True)
elif model == "random_forest":
self.model = RandomForestRegressor(n_estimators=100, random_state=0)
else:
raise ValueError(f"Not a valid option for model: {model}")
else:
self.model = model

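For illustration (not part of the diff): the pre-defined options let the transform be built without instantiating a model by hand. A minimal usage sketch, assuming an already-constructed ``TSDataset`` named ``ts``:

# Hypothetical usage of the new string shortcuts; per the docstring above this is
# equivalent to passing RandomForestRegressor(n_estimators=100, random_state=0) explicitly.
selector = TreeFeatureSelectionTransform(model="random_forest", top_k=5)
selector.fit_transform(ts)
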
def _get_train(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Get train data for model."""
@@ -78,7 +103,12 @@
def _get_features_weights(self, df: pd.DataFrame) -> Dict[str, float]:
"""Get weights for features based on model feature importances."""
train_data, train_target = self._get_train(df)
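# CatBoost handles categorical features natively: if the user did not set
# ``cat_features``, infer them at fit time from the columns of category dtype.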
if isinstance(self.model, CatBoostRegressor) and self.model.get_param("cat_features") is None:
dtypes = train_data.dtypes
cat_features = dtypes[dtypes == "category"].index.tolist()
self.model.fit(train_data, train_target, cat_features=cat_features)
else:
self.model.fit(train_data, train_target)
weights_array = self.model.feature_importances_
weights_dict = {column: weights_array[i] for i, column in enumerate(train_data.columns)}
return weights_dict
@@ -102,7 +132,7 @@ def _fit(self, df: pd.DataFrame) -> "TreeFeatureSelectionTransform":

Returns
-------
result:
instance after fitting
"""
if len(self._get_features_to_use(df)) == 0:
@@ -112,6 +142,24 @@ def _fit(self, df: pd.DataFrame) -> "TreeFeatureSelectionTransform":
self.selected_features = self._select_top_k_features(weights, self.top_k)
return self

def params_to_tune(self) -> Dict[str, "BaseDistribution"]:
"""Get default grid for tuning hyperparameters.

This grid tunes parameters: ``model``, ``top_k``. Other parameters are expected to be set by the user.

For ``model`` parameter only pre-defined options are suggested.
For ``top_k`` parameter the maximum suggested value is not greater than ``self.top_k``.

Returns
-------
:
Grid to tune.
"""
return {
"model": CategoricalDistribution(["catboost", "random_forest"]),
"top_k": IntUniformDistribution(low=1, high=self.top_k),
}

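For illustration (not in the diff), this grid can be consumed through optuna's public ``suggest_*`` API. The sketch below mirrors the grid of a transform created with ``top_k=10``; the dataset, pipeline, and metric are left as placeholders:

import optuna

def objective(trial: optuna.Trial) -> float:
    # Mirror params_to_tune() of TreeFeatureSelectionTransform(model="catboost", top_k=10).
    model = trial.suggest_categorical("model", ["catboost", "random_forest"])
    top_k = trial.suggest_int("top_k", 1, 10)
    transform = TreeFeatureSelectionTransform(model=model, top_k=top_k)
    # ... fit a pipeline with `transform` on a TSDataset and evaluate it here ...
    return 0.0  # placeholder: return the real backtest metric
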

class MRMRFeatureSelectionTransform(BaseFeatureSelectionTransform):
"""Transform that selects features according to MRMR variable selection method adapted to the timeseries case.
@@ -176,7 +224,7 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform":

Returns
-------
result:
instance after fitting
"""
features = self._get_features_to_use(df)
@@ -193,3 +241,19 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform":
atol=self.atol,
)
return self

def params_to_tune(self) -> Dict[str, "BaseDistribution"]:
"""Get default grid for tuning hyperparameters.

This grid tunes only ``top_k`` parameter. Other parameters are expected to be set by the user.

For ``top_k`` parameter the maximum suggested value is not greater than ``self.top_k``.

Returns
-------
:
Grid to tune.
"""
return {
"top_k": IntUniformDistribution(low=1, high=self.top_k),
}
23 changes: 23 additions & 0 deletions etna/transforms/feature_selection/gale_shapley.py
@@ -8,10 +8,16 @@
import pandas as pd
from typing_extensions import Literal

from etna import SETTINGS
from etna.analysis import RelevanceTable
from etna.core import BaseMixin
from etna.transforms.feature_selection.base import BaseFeatureSelectionTransform

if SETTINGS.auto_required:
from optuna.distributions import BaseDistribution
from optuna.distributions import CategoricalDistribution
from optuna.distributions import IntUniformDistribution


class BaseGaleShapley(BaseMixin):
"""Base class for a member of Gale-Shapley matching."""
@@ -385,3 +391,20 @@ def _fit(self, df: pd.DataFrame) -> "GaleShapleyFeatureSelectionTransform":
segment_features_ranking=segment_features_ranking, features_to_drop=selected_features
)
return self

def params_to_tune(self) -> Dict[str, "BaseDistribution"]:
"""Get default grid for tuning hyperparameters.

This grid tunes parameters: ``top_k``, ``use_rank``. Other parameters are expected to be set by the user.

For ``top_k`` parameter the maximum suggested value is not greater than ``self.top_k``.

Returns
-------
:
Grid to tune.
"""
return {
"top_k": IntUniformDistribution(low=1, high=self.top_k),
"use_rank": CategoricalDistribution([False, True]),
}
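
Relative to the tree-based grid, this one adds the boolean ``use_rank`` switch; under the same optuna assumptions as the sketch above (import and study setup omitted):

def objective(trial: "optuna.Trial") -> float:
    # Hypothetical: mirror the grid of a GaleShapleyFeatureSelectionTransform built with top_k=5.
    use_rank = trial.suggest_categorical("use_rank", [False, True])
    top_k = trial.suggest_int("top_k", 1, 5)
    return 0.0  # placeholder: construct the transform with (top_k, use_rank), backtest, return metric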
@@ -18,6 +18,7 @@
from etna.transforms import SegmentEncoderTransform
from etna.transforms.feature_selection import TreeFeatureSelectionTransform
from etna.transforms.feature_selection.feature_importance import MRMRFeatureSelectionTransform
from tests.test_transforms.utils import assert_sampling_is_valid
from tests.test_transforms.utils import assert_transformation_equals_loaded_original


@@ -67,9 +68,37 @@ def ts_with_regressors():
)


@pytest.fixture
def ts_with_regressors_and_features(ts_with_regressors):
le_encoder = SegmentEncoderTransform()
le_encoder.fit_transform(ts_with_regressors)
return ts_with_regressors


def test_create_with_unknown_model(ts_with_exog):
with pytest.raises(ValueError, match="Not a valid option for model: .*"):
_ = TreeFeatureSelectionTransform(model="unknown", top_k=3, features_to_use="all")


@pytest.mark.parametrize(
"model",
[
"catboost",
CatBoostRegressor(iterations=10, random_state=42, silent=True),
CatBoostRegressor(iterations=10, random_state=42, silent=True, cat_features=["segment_code"]),
],
)
def test_catboost_with_cat_features(model, ts_with_regressors_and_features):
"""Check that transform with catboost model can work with cat features in a dataset."""
selector = TreeFeatureSelectionTransform(model=model, top_k=3, features_to_use="all")
selector.fit_transform(ts_with_regressors_and_features)


@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -86,23 +115,21 @@ def test_work_with_non_regressors(ts_with_exog, model):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
ExtraTreesRegressor(n_estimators=10, random_state=42),
GradientBoostingRegressor(n_estimators=10, random_state=42),
CatBoostRegressor(iterations=10, random_state=42, silent=True),
],
)
@pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50])
def test_selected_top_k_regressors(model, top_k, ts_with_regressors):
"""Check that transform selects exactly top_k regressors if where are this much."""
all_regressors = ts_with_regressors.regressors
all_regressors.append("segment_code")

ts = ts_with_regressors
le_encoder = SegmentEncoderTransform()
le_encoder.fit_transform(ts)
all_regressors = ts_with_regressors.regressors
selector = TreeFeatureSelectionTransform(model=model, top_k=top_k)
selector.fit_transform(ts)

@@ -113,25 +140,25 @@ def test_selected_top_k_regressors(model, top_k, ts_with_regressors):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
ExtraTreesRegressor(n_estimators=10, random_state=42),
GradientBoostingRegressor(n_estimators=10, random_state=42),
CatBoostRegressor(iterations=10, random_state=42, silent=True),
],
)
@pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50])
def test_retain_values(model, top_k, ts_with_regressors):
"""Check that transform doesn't change values of columns."""
ts = ts_with_regressors
le_encoder = SegmentEncoderTransform()
le_encoder.fit_transform(ts)
df_encoded = ts.to_pandas()
selector = TreeFeatureSelectionTransform(model=model, top_k=top_k)
df_selected = selector.fit_transform(ts).to_pandas()

for segment in ts.segments:
for column in df_selected.columns.get_level_values("feature").unique():
assert (
df_selected.loc[:, pd.IndexSlice[segment, column]] == df_encoded.loc[:, pd.IndexSlice[segment, column]]
@@ -141,6 +168,8 @@ def test_retain_values(model, top_k, ts_with_regressors):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -158,6 +187,8 @@ def test_fails_negative_top_k(model):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -178,20 +209,20 @@ def test_warns_no_regressors(model, example_tsds):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
ExtraTreesRegressor(n_estimators=10, random_state=42),
GradientBoostingRegressor(n_estimators=10, random_state=42),
CatBoostRegressor(iterations=700, random_state=42, silent=True),
],
)
def test_sanity_selected(model, ts_with_regressors):
"""Check that transform correctly finds meaningful regressors."""
ts = ts_with_regressors
selector = TreeFeatureSelectionTransform(model=model, top_k=10)
df_selected = selector.fit_transform(ts).to_pandas()
features_columns = df_selected.columns.get_level_values("feature").unique()
selected_regressors = [column for column in features_columns if column.startswith("regressor_")]
@@ -202,6 +233,8 @@ def test_sanity_selected(model, ts_with_regressors):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -231,6 +264,8 @@ def test_sanity_model(model, ts_with_regressors):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -286,3 +321,19 @@ def test_mrmr_right_regressors(relevance_table, ts_with_regressors):
)
def test_save_load(transform, ts_with_regressors):
assert_transformation_equals_loaded_original(transform=transform, ts=ts_with_regressors)


@pytest.mark.parametrize(
"transform",
[
TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=3),
MRMRFeatureSelectionTransform(
relevance_table=ModelRelevanceTable(), top_k=3, model=RandomForestRegressor(random_state=42)
),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3),
],
)
def test_params_to_tune(transform, ts_with_regressors):
ts = ts_with_regressors
assert len(transform.params_to_tune()) > 0
assert_sampling_is_valid(transform=transform, ts=ts)
@@ -201,3 +201,16 @@ def test_inverse_transform_back_included_columns(ts_with_features, columns, return_features):
)
def test_save_load(transform, ts_with_features):
assert_transformation_equals_loaded_original(transform=transform, ts=ts_with_features)


@pytest.mark.parametrize(
"transform",
[
FilterFeaturesTransform(include=["target"], return_features=True),
FilterFeaturesTransform(include=["target"], return_features=False),
FilterFeaturesTransform(exclude=["exog_1", "exog_2"], return_features=False),
FilterFeaturesTransform(exclude=["exog_1", "exog_2"], return_features=False),
],
)
def test_params_to_tune(transform):
assert len(transform.params_to_tune()) == 0