
Add default params_to_tune for feature selection transforms #1250

Merged
merged 8 commits on May 3, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -56,6 +56,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
- Add default `params_to_tune` for `TimeSeriesImputerTransform` ([#1232](https://github.com/tinkoff-ai/etna/pull/1232))
- Add default `params_to_tune` for `DifferencingTransform`, `MedianTransform`, `MaxTransform`, `MinTransform`, `QuantileTransform`, `StdTransform`, `MeanTransform`, `MADTransform`, `MinMaxDifferenceTransform`, `SumTransform`, `BoxCoxTransform`, `YeoJohnsonTransform`, `MaxAbsScalerTransform`, `MinMaxScalerTransform`, `RobustScalerTransform` and `StandardScalerTransform` ([#1233](https://github.com/tinkoff-ai/etna/pull/1233))
- Add default `params_to_tune` for `LabelEncoderTransform` ([#1242](https://github.com/tinkoff-ai/etna/pull/1242))
- Add default `params_to_tune` for `TreeFeatureSelectionTransform`, `MRMRFeatureSelectionTransform` and `GaleShapleyFeatureSelectionTransform` ([#1250](https://github.com/tinkoff-ai/etna/pull/1250))
### Fixed
- Fix bug in `GaleShapleyFeatureSelectionTransform` with wrong number of remaining features ([#1110](https://github.com/tinkoff-ai/etna/pull/1110))
- `ProphetModel` fails with additional seasonality set ([#1157](https://github.com/tinkoff-ai/etna/pull/1157))
78 changes: 71 additions & 7 deletions etna/transforms/feature_selection/feature_importance.py
@@ -14,12 +14,19 @@
from sklearn.tree import ExtraTreeRegressor
from typing_extensions import Literal

from etna import SETTINGS
from etna.analysis import RelevanceTable
from etna.analysis.feature_selection.mrmr_selection import AggregationMode
from etna.analysis.feature_selection.mrmr_selection import mrmr
from etna.datasets import TSDataset
from etna.transforms.feature_selection import BaseFeatureSelectionTransform

if SETTINGS.auto_required:
from optuna.distributions import BaseDistribution
from optuna.distributions import CategoricalDistribution
from optuna.distributions import IntUniformDistribution


TreeBasedRegressor = Union[
DecisionTreeRegressor,
ExtraTreeRegressor,
@@ -41,7 +48,7 @@ class TreeFeatureSelectionTransform(BaseFeatureSelectionTransform):

def __init__(
self,
model: Union[Literal["catboost"], Literal["random_forest"], TreeBasedRegressor],
top_k: int,
features_to_use: Union[List[str], Literal["all"]] = "all",
return_features: bool = False,
@@ -52,8 +59,18 @@ def __init__(
Parameters
----------
model:
Model to make selection; it should have a ``feature_importances_`` property
(e.g. all tree-based regressors in sklearn).

If ``catboost.CatBoostRegressor`` is given without the ``cat_features`` parameter,
then ``cat_features`` is set during ``fit`` to the columns of category dtype.

Pre-defined options are also available:

* catboost: ``catboost.CatBoostRegressor(iterations=1000, silent=True)``;

* random_forest: ``sklearn.ensemble.RandomForestRegressor(n_estimators=100, random_state=0)``.

top_k:
number of features to select; if there are not enough features, then all will be selected
features_to_use:
@@ -64,8 +81,16 @@
if not isinstance(top_k, int) or top_k < 0:
raise ValueError("Parameter top_k should be positive integer")
super().__init__(features_to_use=features_to_use, return_features=return_features)
self.top_k = top_k
if isinstance(model, str):
if model == "catboost":
self.model = CatBoostRegressor(iterations=1000, silent=True)
elif model == "random_forest":
self.model = RandomForestRegressor(n_estimators=100, random_state=0)
else:
raise ValueError(f"Not a valid option for model: {model}")
else:
self.model = model

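For illustration (not part of the diff): the pre-defined options let the transform be built without instantiating a model by hand. A minimal usage sketch, assuming an already-constructed ``TSDataset`` named ``ts``:

# Hypothetical usage of the new string shortcuts; per the docstring above this is
# equivalent to passing RandomForestRegressor(n_estimators=100, random_state=0) explicitly.
selector = TreeFeatureSelectionTransform(model="random_forest", top_k=5)
selector.fit_transform(ts)
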
def _get_train(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Get train data for model."""
@@ -78,7 +103,12 @@
def _get_features_weights(self, df: pd.DataFrame) -> Dict[str, float]:
"""Get weights for features based on model feature importances."""
train_data, train_target = self._get_train(df)
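# CatBoost handles categorical features natively: if the user did not set
# ``cat_features``, infer them at fit time from the columns of category dtype.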
if isinstance(self.model, CatBoostRegressor) and self.model.get_param("cat_features") is None:
dtypes = train_data.dtypes
cat_features = dtypes[dtypes == "category"].index.tolist()
self.model.fit(train_data, train_target, cat_features=cat_features)
else:
self.model.fit(train_data, train_target)
weights_array = self.model.feature_importances_
weights_dict = {column: weights_array[i] for i, column in enumerate(train_data.columns)}
return weights_dict
@@ -102,7 +132,7 @@ def _fit(self, df: pd.DataFrame) -> "TreeFeatureSelectionTransform":

Returns
-------
result:
instance after fitting
"""
if len(self._get_features_to_use(df)) == 0:
@@ -112,6 +142,24 @@ def _fit(self, df: pd.DataFrame) -> "TreeFeatureSelectionTransform":
self.selected_features = self._select_top_k_features(weights, self.top_k)
return self

def params_to_tune(self) -> Dict[str, "BaseDistribution"]:
"""Get default grid for tuning hyperparameters.

This grid tunes parameters: ``model``, ``top_k``. Other parameters are expected to be set by the user.

For ``model`` parameter only pre-defined options are suggested.
For ``top_k`` parameter the maximum suggested value is not greater than ``self.top_k``.

Returns
-------
:
Grid to tune.
"""
return {
"model": CategoricalDistribution(["catboost", "random_forest"]),
"top_k": IntUniformDistribution(low=1, high=self.top_k),
}

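For illustration (not in the diff), this grid can be consumed through optuna's public ``suggest_*`` API. The sketch below mirrors the grid of a transform created with ``top_k=10``; the dataset, pipeline, and metric are left as placeholders:

import optuna

def objective(trial: optuna.Trial) -> float:
    # Mirror params_to_tune() of TreeFeatureSelectionTransform(model="catboost", top_k=10).
    model = trial.suggest_categorical("model", ["catboost", "random_forest"])
    top_k = trial.suggest_int("top_k", 1, 10)
    transform = TreeFeatureSelectionTransform(model=model, top_k=top_k)
    # ... fit a pipeline with `transform` on a TSDataset and evaluate it here ...
    return 0.0  # placeholder: return the real backtest metric
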

class MRMRFeatureSelectionTransform(BaseFeatureSelectionTransform):
"""Transform that selects features according to MRMR variable selection method adapted to the timeseries case.
@@ -176,7 +224,7 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform":

Returns
-------
result:
instance after fitting
"""
features = self._get_features_to_use(df)
@@ -193,3 +241,19 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform":
atol=self.atol,
)
return self

def params_to_tune(self) -> Dict[str, "BaseDistribution"]:
"""Get default grid for tuning hyperparameters.

This grid tunes only ``top_k`` parameter. Other parameters are expected to be set by the user.

For ``top_k`` parameter the maximum suggested value is not greater than ``self.top_k``.

Returns
-------
:
Grid to tune.
"""
return {
"top_k": IntUniformDistribution(low=1, high=self.top_k),
}
23 changes: 23 additions & 0 deletions etna/transforms/feature_selection/gale_shapley.py
@@ -8,10 +8,16 @@
import pandas as pd
from typing_extensions import Literal

from etna import SETTINGS
from etna.analysis import RelevanceTable
from etna.core import BaseMixin
from etna.transforms.feature_selection.base import BaseFeatureSelectionTransform

if SETTINGS.auto_required:
from optuna.distributions import BaseDistribution
from optuna.distributions import CategoricalDistribution
from optuna.distributions import IntUniformDistribution


class BaseGaleShapley(BaseMixin):
"""Base class for a member of Gale-Shapley matching."""
@@ -385,3 +391,20 @@ def _fit(self, df: pd.DataFrame) -> "GaleShapleyFeatureSelectionTransform":
segment_features_ranking=segment_features_ranking, features_to_drop=selected_features
)
return self

def params_to_tune(self) -> Dict[str, "BaseDistribution"]:
"""Get default grid for tuning hyperparameters.

This grid tunes parameters: ``top_k``, ``use_rank``. Other parameters are expected to be set by the user.

For ``top_k`` parameter the maximum suggested value is not greater than ``self.top_k``.

Returns
-------
:
Grid to tune.
"""
return {
"top_k": IntUniformDistribution(low=1, high=self.top_k),
"use_rank": CategoricalDistribution([False, True]),
}
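
Relative to the tree-based grid, this one adds the boolean ``use_rank`` switch; under the same optuna assumptions as the sketch above (import and study setup omitted):

def objective(trial: "optuna.Trial") -> float:
    # Hypothetical: mirror the grid of a GaleShapleyFeatureSelectionTransform built with top_k=5.
    use_rank = trial.suggest_categorical("use_rank", [False, True])
    top_k = trial.suggest_int("top_k", 1, 5)
    return 0.0  # placeholder: construct the transform with (top_k, use_rank), backtest, return metric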
@@ -18,6 +18,7 @@
from etna.transforms import SegmentEncoderTransform
from etna.transforms.feature_selection import TreeFeatureSelectionTransform
from etna.transforms.feature_selection.feature_importance import MRMRFeatureSelectionTransform
from tests.test_transforms.utils import assert_sampling_is_valid
from tests.test_transforms.utils import assert_transformation_equals_loaded_original


@@ -67,9 +68,37 @@ def ts_with_regressors():
)


@pytest.fixture
def ts_with_regressors_and_features(ts_with_regressors):
le_encoder = SegmentEncoderTransform()
le_encoder.fit_transform(ts_with_regressors)
return ts_with_regressors


def test_create_with_unknown_model(ts_with_exog):
with pytest.raises(ValueError, match="Not a valid option for model: .*"):
_ = TreeFeatureSelectionTransform(model="unknown", top_k=3, features_to_use="all")


@pytest.mark.parametrize(
"model",
[
"catboost",
CatBoostRegressor(iterations=10, random_state=42, silent=True),
CatBoostRegressor(iterations=10, random_state=42, silent=True, cat_features=["segment_code"]),
],
)
def test_catboost_with_cat_features(model, ts_with_regressors_and_features):
"""Check that transform with catboost model can work with cat features in a dataset."""
selector = TreeFeatureSelectionTransform(model=model, top_k=3, features_to_use="all")
selector.fit_transform(ts_with_regressors_and_features)


@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -86,23 +115,21 @@ def test_work_with_non_regressors(ts_with_exog, model):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
ExtraTreesRegressor(n_estimators=10, random_state=42),
GradientBoostingRegressor(n_estimators=10, random_state=42),
CatBoostRegressor(iterations=10, random_state=42, silent=True),
],
)
@pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50])
def test_selected_top_k_regressors(model, top_k, ts_with_regressors):
"""Check that transform selects exactly top_k regressors if where are this much."""
all_regressors = ts_with_regressors.regressors
all_regressors.append("segment_code")

ts = ts_with_regressors
le_encoder = SegmentEncoderTransform()
le_encoder.fit_transform(ts)
all_regressors = ts_with_regressors.regressors
selector = TreeFeatureSelectionTransform(model=model, top_k=top_k)
selector.fit_transform(ts)

@@ -113,25 +140,25 @@ def test_selected_top_k_regressors(model, top_k, ts_with_regressors):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
ExtraTreesRegressor(n_estimators=10, random_state=42),
GradientBoostingRegressor(n_estimators=10, random_state=42),
CatBoostRegressor(iterations=10, random_state=42, silent=True),
],
)
@pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50])
def test_retain_values(model, top_k, ts_with_regressors):
"""Check that transform doesn't change values of columns."""
ts = ts_with_regressors
le_encoder = SegmentEncoderTransform()
le_encoder.fit_transform(ts)
df_encoded = ts.to_pandas()
selector = TreeFeatureSelectionTransform(model=model, top_k=top_k)
df_selected = selector.fit_transform(ts).to_pandas()

for segment in ts.segments:
for column in df_selected.columns.get_level_values("feature").unique():
assert (
df_selected.loc[:, pd.IndexSlice[segment, column]] == df_encoded.loc[:, pd.IndexSlice[segment, column]]
@@ -141,6 +168,8 @@ def test_retain_values(model, top_k, ts_with_regressors):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -158,6 +187,8 @@ def test_fails_negative_top_k(model):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -178,20 +209,20 @@ def test_warns_no_regressors(model, example_tsds):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
ExtraTreesRegressor(n_estimators=10, random_state=42),
GradientBoostingRegressor(n_estimators=10, random_state=42),
CatBoostRegressor(iterations=700, random_state=42, silent=True),
],
)
def test_sanity_selected(model, ts_with_regressors):
"""Check that transform correctly finds meaningful regressors."""
ts = ts_with_regressors
selector = TreeFeatureSelectionTransform(model=model, top_k=10)
df_selected = selector.fit_transform(ts).to_pandas()
features_columns = df_selected.columns.get_level_values("feature").unique()
selected_regressors = [column for column in features_columns if column.startswith("regressor_")]
@@ -202,6 +233,8 @@ def test_sanity_selected(model, ts_with_regressors):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -231,6 +264,8 @@ def test_sanity_model(model, ts_with_regressors):
@pytest.mark.parametrize(
"model",
[
"random_forest",
"catboost",
DecisionTreeRegressor(random_state=42),
ExtraTreeRegressor(random_state=42),
RandomForestRegressor(n_estimators=10, random_state=42),
@@ -286,3 +321,19 @@ def test_mrmr_right_regressors(relevance_table, ts_with_regressors):
)
def test_save_load(transform, ts_with_regressors):
assert_transformation_equals_loaded_original(transform=transform, ts=ts_with_regressors)


@pytest.mark.parametrize(
"transform",
[
TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=3),
MRMRFeatureSelectionTransform(
relevance_table=ModelRelevanceTable(), top_k=3, model=RandomForestRegressor(random_state=42)
),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3),
],
)
def test_params_to_tune(transform, ts_with_regressors):
ts = ts_with_regressors
assert len(transform.params_to_tune()) > 0
assert_sampling_is_valid(transform=transform, ts=ts)
@@ -201,3 +201,16 @@ def test_inverse_transform_back_included_columns(ts_with_features, columns, return_features):
)
def test_save_load(transform, ts_with_features):
assert_transformation_equals_loaded_original(transform=transform, ts=ts_with_features)


@pytest.mark.parametrize(
"transform",
[
FilterFeaturesTransform(include=["target"], return_features=True),
FilterFeaturesTransform(include=["target"], return_features=False),
FilterFeaturesTransform(exclude=["exog_1", "exog_2"], return_features=False),
FilterFeaturesTransform(exclude=["exog_1", "exog_2"], return_features=False),
],
)
def test_params_to_tune(transform):
assert len(transform.params_to_tune()) == 0