Skip to content

Commit

Permalink
Add method for returning list of features from TSDataset (#405)
Browse files Browse the repository at this point in the history
  • Loading branch information
d-a-bunin authored Jun 25, 2024
1 parent bef697e commit ccc5ba4
Show file tree
Hide file tree
Showing 24 changed files with 196 additions and 80 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `IForestOutlierTransform` ([#381](https://github.com/etna-team/etna/pull/381))
- Add `IQROutlierTransform` ([#387](https://github.com/etna-team/etna/pull/387))
- Add `num_workers` parameter to `TS2VecEmbeddingModel` ([#396](https://github.com/etna-team/etna/pull/396))
-
- Add `TSDataset.features` property to get list of all features in a dataset ([#405](https://github.com/etna-team/etna/pull/405))
-

### Changed
Expand All @@ -34,7 +34,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
-
-
- Update `TSDataset.describe`, `TSDataset.info` to exclude target intervals and target components in `num_exogs` ([#405](https://github.com/etna-team/etna/pull/405))
-
-
-
Expand Down
2 changes: 1 addition & 1 deletion etna/analysis/eda/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def plot_correlation_matrix(
if segments is None:
segments = sorted(ts.segments)
if columns is None:
columns = list(set(ts.df.columns.get_level_values("feature")))
columns = ts.features
if "vmin" not in heatmap_kwargs:
heatmap_kwargs["vmin"] = -1
if "vmax" not in heatmap_kwargs:
Expand Down
2 changes: 1 addition & 1 deletion etna/analysis/eda/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def get_correlation_matrix(
if segments is None:
segments = sorted(ts.segments)
if columns is None:
columns = list(set(ts.df.columns.get_level_values("feature")))
columns = ts.features

correlation_matrix = ts[:, segments, columns].corr(method=method).values
return correlation_matrix
Expand Down
2 changes: 1 addition & 1 deletion etna/analysis/feature_relevance/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def plot_feature_relevance(
if segments is None:
segments = sorted(ts.segments)
border_value = None
features = list(set(ts.columns.get_level_values("feature")) - {"target"})
features = list(set(ts.features) - {"target"})
relevance_df = relevance_table(df=ts[:, segments, "target"], df_exog=ts[:, segments, features], **relevance_params)
if relevance_aggregation_mode == "per-segment":
_, ax = _prepare_axes(num_plots=len(segments), columns_num=columns_num, figsize=figsize)
Expand Down
4 changes: 2 additions & 2 deletions etna/analysis/forecast/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -898,9 +898,9 @@ def plot_forecast_decomposition(
components_mode = ComponentsMode(mode)

if segments is None:
segments = list(forecast_ts.columns.get_level_values("segment").unique())
segments = forecast_ts.segments

column_names = set(forecast_ts.columns.get_level_values("feature"))
column_names = set(forecast_ts.features)
components = list(match_target_components(column_names))

if len(components) == 0:
Expand Down
2 changes: 1 addition & 1 deletion etna/analysis/outliers/isolation_forest_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
def _select_features(
ts: TSDataset, in_column: str, features_to_use: Optional[Sequence[str]], features_to_ignore: Optional[Sequence[str]]
) -> pd.DataFrame:
features = ts.columns.get_level_values("feature")
features = ts.features
if in_column not in features:
raise ValueError(f"Feature {in_column} is not present in the dataset.")

Expand Down
26 changes: 24 additions & 2 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,22 @@ def regressors(self) -> List[str]:
"""
return self._regressors

    @property
    def features(self) -> List[str]:
        """Get list of all features across all segments in dataset.

        All features include initial exogenous data, generated features, target, target components, prediction intervals.

        The order of features in returned list isn't specified.

        If different segments have different subset of features, then the union of features is returned.

        Returns
        -------
        :
            List of features.
        """
        # The "feature" level of the column MultiIndex repeats feature names per segment;
        # ``unique()`` deduplicates across segments (union), keeping first-appearance order.
        return self.df.columns.get_level_values("feature").unique().tolist()

@property
def target_components_names(self) -> Tuple[str, ...]:
"""Get tuple with target components names. Components sum up to target. Return the empty tuple in case of components absence."""
Expand Down Expand Up @@ -886,7 +902,7 @@ def to_pandas(self, flatten: bool = False, features: Union[Literal["all"], Seque
if features == "all":
return self.df.copy()
raise ValueError("The only possible literal is 'all'")
segments = self.columns.get_level_values("segment").unique().tolist()
segments = self.segments
return self.df.loc[:, self.idx[segments, features]].copy()
return self.to_flatten(self.df, features=features)

Expand Down Expand Up @@ -1590,9 +1606,15 @@ def tail(self, n_rows: int = 5) -> pd.DataFrame:

def _gather_common_data(self) -> Dict[str, Any]:
"""Gather information about dataset in general."""
features = set(self.features)
exogs = (
features.difference({"target"})
.difference(self.prediction_intervals_names)
.difference(self.target_components_names)
)
common_dict: Dict[str, Any] = {
"num_segments": len(self.segments),
"num_exogs": self.df.columns.get_level_values("feature").difference(["target"]).nunique(),
"num_exogs": len(exogs),
"num_regressors": len(self.regressors),
"num_known_future": len(self.known_future),
"freq": self.freq,
Expand Down
2 changes: 1 addition & 1 deletion etna/models/deadline_ma.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def get_model(self) -> "DeadlineMovingAverageModel":
return self

def _check_not_used_columns(self, ts: TSDataset):
columns = set(ts.columns.get_level_values("feature"))
columns = set(ts.features)
columns_not_used = columns.difference({"target"})
if columns_not_used:
warnings.warn(
Expand Down
2 changes: 1 addition & 1 deletion etna/models/seasonal_ma.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def get_model(self) -> "SeasonalMovingAverageModel":
return self

def _check_not_used_columns(self, ts: TSDataset):
columns = set(ts.columns.get_level_values("feature"))
columns = set(ts.features)
columns_not_used = columns.difference({"target"})
if columns_not_used:
warnings.warn(
Expand Down
2 changes: 1 addition & 1 deletion etna/transforms/outliers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def fit(self, ts: TSDataset) -> "OutliersTransform":
The fitted transform instance.
"""
if self.ignore_flag_column is not None:
if self.ignore_flag_column not in ts.columns.get_level_values("feature"):
if self.ignore_flag_column not in ts.features:
raise ValueError(f'Name ignore_flag_column="{self.ignore_flag_column}" not find.')
types_ignore_flag = ts[..., self.ignore_flag_column].isin([0, 1]).all(axis=0)
if not all(types_ignore_flag):
Expand Down
2 changes: 1 addition & 1 deletion examples/103-EDA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1136,7 +1136,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
"version": "3.10.13"
},
"vscode": {
"interpreter": {
Expand Down
2 changes: 1 addition & 1 deletion examples/201-exogenous_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1477,7 +1477,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
"version": "3.10.13"
}
},
"nbformat": 4,
Expand Down
103 changes: 65 additions & 38 deletions examples/207-feature_selection.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_create_ts_by_column_interface(outliers_tsds, column):
new_ts = create_ts_by_column(outliers_tsds, column)
assert isinstance(new_ts, TSDataset)
assert outliers_tsds.segments == new_ts.segments
assert new_ts.columns.get_level_values("feature").unique().tolist() == ["target"]
assert new_ts.features == ["target"]


@pytest.mark.parametrize("column", ["exog"])
Expand Down
81 changes: 74 additions & 7 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,38 @@ def ts_info() -> TSDataset:
return ts


@pytest.fixture
def ts_info_with_components_and_quantiles() -> TSDataset:
    """Build a three-segment dataset with both prediction intervals and target components attached."""
    timestamp = pd.date_range("2021-01-01", "2021-02-01")
    # Segments "1", "2", "3" with constant targets 11, 12, 13 respectively.
    segment_frames = [
        pd.DataFrame({"timestamp": timestamp, "target": 10 + seg_idx, "segment": str(seg_idx)})
        for seg_idx in (1, 2, 3)
    ]
    wide_df = TSDataset.to_dataset(pd.concat(segment_frames, ignore_index=True))

    ts = TSDataset(df=wide_df, freq="D")

    # Symmetric +/-1 band around the target as prediction intervals.
    lower_bound = wide_df.rename({"target": "target_0.025"}, axis=1, level="feature") - 1
    upper_bound = wide_df.rename({"target": "target_0.975"}, axis=1, level="feature") + 1
    ts.add_prediction_intervals(pd.concat([lower_bound, upper_bound], axis=1))

    # Two equal components that sum up to the target.
    component_a = wide_df.rename({"target": "target_a"}, axis=1, level="feature") / 2
    component_b = wide_df.rename({"target": "target_b"}, axis=1, level="feature") / 2
    ts.add_target_components(pd.concat([component_a, component_b], axis=1))

    return ts


@pytest.fixture
def df_update_add_column() -> pd.DataFrame:
timestamp = pd.date_range("2021-01-01", "2021-02-12")
Expand Down Expand Up @@ -327,6 +359,14 @@ def ts_with_prediction_intervals(ts_without_target_components, prediction_interv
return ts


@pytest.fixture()
def ts_after_transform(example_tsds):
    """Return ``example_tsds`` with a generated feature added by a non-inplace ``AddConstTransform``."""
    add_const = AddConstTransform(in_column="target", value=0, inplace=False, out_column="add_target")
    example_tsds.fit_transform(transforms=[add_const])
    return example_tsds


def test_create_ts_with_datetime_timestamp():
freq = "D"
df = generate_ar_df(periods=10, freq=freq, n_segments=3)
Expand Down Expand Up @@ -1337,14 +1377,25 @@ def test_fit_transform_raise_warning_on_diff_endings(ts_diff_endings):
ts_diff_endings.fit_transform([])


def test_gather_common_data(ts_info):
@pytest.mark.parametrize(
    "ts_name, expected_answer",
    [
        ("ts_info", {"num_segments": 3, "num_exogs": 2, "num_regressors": 2, "num_known_future": 2, "freq": "D"}),
        (
            "ts_info_with_components_and_quantiles",
            {"num_segments": 3, "num_exogs": 0, "num_regressors": 0, "num_known_future": 0, "freq": "D"},
        ),
    ],
)
def test_gather_common_data(ts_name, expected_answer, request):
    """Check that TSDataset._gather_common_data correctly finds common data for info/describe methods."""
    ts = request.getfixturevalue(ts_name)
    common_data = ts._gather_common_data()
    # Compare every expected statistic against what the dataset reports.
    for stat_name, expected_value in expected_answer.items():
        assert common_data[stat_name] == expected_value


def test_gather_segments_data(ts_info):
Expand Down Expand Up @@ -1925,3 +1976,19 @@ def test_create_from_misaligned_fail_name_intersection(
future_steps=future_steps,
known_future=known_future,
)


@pytest.mark.parametrize(
    "ts_name, expected_features",
    [
        ("example_tsds", ["target"]),
        ("tsdf_with_exog", ["target", "exog"]),
        ("ts_after_transform", ["target", "add_target"]),
        ("ts_with_prediction_intervals", ["target", "target_0.1", "target_0.9"]),
        ("ts_with_target_components", ["target", "target_component_a", "target_component_b"]),
    ],
)
def test_features(ts_name, expected_features, request):
    """Check that ``TSDataset.features`` returns the expected feature names, ignoring order."""
    dataset = request.getfixturevalue(ts_name)
    # The property does not guarantee ordering, so compare sorted lists.
    assert sorted(dataset.features) == sorted(expected_features)
4 changes: 2 additions & 2 deletions tests/test_datasets/test_hierarchical_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def test_init_df_same_level_df_exog(
):
df, df_exog = market_level_df, market_level_df_exog
ts = TSDataset(df=df, freq="D", df_exog=df_exog, hierarchical_structure=hierarchical_structure)
df_columns = set(ts.columns.get_level_values("feature"))
df_columns = set(ts.features)
assert df_columns == expected_columns


Expand All @@ -265,7 +265,7 @@ def test_init_df_different_level_df_exog(
):
df, df_exog = product_level_df, market_level_df_exog
ts = TSDataset(df=df, freq="D", df_exog=df_exog, hierarchical_structure=hierarchical_structure)
df_columns = set(ts.columns.get_level_values("feature"))
df_columns = set(ts.features)
assert df_columns == expected_columns


Expand Down
4 changes: 2 additions & 2 deletions tests/test_transforms/test_base/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def test_transform_request_correct_columns(remove_columns_ts, required_features)
)
def test_transform_request_update_dataset(remove_columns_ts, required_features):
ts, _ = remove_columns_ts
columns_before = set(ts.columns.get_level_values("feature"))
columns_before = set(ts.features)
ts.to_pandas = Mock(return_value=ts.df)

transform = TransformMock(required_features=required_features)
Expand All @@ -164,7 +164,7 @@ def test_inverse_transform_add_target_quantiles(remove_columns_ts, in_column, ex

def test_inverse_transform_request_update_dataset(remove_columns_ts):
ts, _ = remove_columns_ts
columns_before = set(ts.columns.get_level_values("feature"))
columns_before = set(ts.features)
ts.to_pandas = Mock(return_value=ts.df)

transform = ReversibleTransformMock(required_features="all")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_segment_encoder_transform(simple_ts):
), "Number of columns not the same as segments"
assert len(simple_ts.to_pandas()) == len(transformed_df), "Row missing"
codes = set()
for segment in simple_ts.columns.get_level_values("segment").unique():
for segment in simple_ts.segments:
column = transformed_df.loc[:, pd.IndexSlice[segment, "segment_code"]]
assert column.dtype == "category", "Column type is not category"
assert np.all(column == column.iloc[0]), "Values are not the same for the whole column"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def test_selected_top_k_regressors(model, top_k, ts_with_regressors):
selector = TreeFeatureSelectionTransform(model=model, top_k=top_k)
selector.fit_transform(ts)

selected_regressors = set(ts.columns.get_level_values("feature")).difference({"target"})
selected_regressors = set(ts.features).difference({"target"})
assert len(selected_regressors) == min(len(all_regressors), top_k)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,7 @@ def test_right_number_features_with_integer_division(ts_with_exog_galeshapley):
transform.fit(ts_with_exog_galeshapley)
ts = transform.transform(ts_with_exog_galeshapley)

remaining_columns = ts.columns.get_level_values("feature").unique().tolist()
remaining_columns = ts.features
assert len(remaining_columns) == top_k + 1


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_fit(ts, request):
in_column="regressor_exog", inplace=True, distribution_column="target", out_column=None
)
resampler.fit(ts)
segments = ts.df.columns.get_level_values("segment").unique()
segments = ts.segments
for segment in segments:
assert (resampler.segment_transforms[segment].distribution == expected_distribution[segment]).all().all()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,12 @@ def test_repr():
def test_interface_correct_args_out_column(in_column: Optional[str], true_params: List[str], train_ts: TSDataset):
"""Test that transform generates correct column names using out_column parameter."""
init_params = deepcopy(INIT_PARAMS_TEMPLATE)
segments = train_ts.columns.get_level_values("segment").unique()
segments = train_ts.segments
out_column = "dateflags"
for key in true_params:
init_params[key] = True
transform = DateFlagsTransform(**init_params, out_column=out_column, in_column=in_column)
initial_columns = train_ts.columns.get_level_values("feature").unique()
initial_columns = train_ts.features

result = transform.fit_transform(train_ts).to_pandas()

Expand Down Expand Up @@ -244,14 +244,14 @@ def test_interface_correct_args_out_column(in_column: Optional[str], true_params
def test_interface_correct_args_repr(in_column: Optional[str], true_params: List[str], train_ts: TSDataset):
"""Test that transform generates correct column names without setting out_column parameter."""
init_params = deepcopy(INIT_PARAMS_TEMPLATE)
segments = train_ts.columns.get_level_values("segment").unique()
segments = train_ts.segments
for key in true_params:
if key in SPECIAL_DAYS_PARAMS:
init_params[key] = SPECIAL_DAYS
else:
init_params[key] = True
transform = DateFlagsTransform(**init_params, in_column=in_column)
initial_columns = train_ts.columns.get_level_values("feature").unique()
initial_columns = train_ts.features

result = transform.fit_transform(deepcopy(train_ts)).to_pandas()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ def test_fail_set_both():
)
def test_column_names(example_ts, period, order, num_columns):
"""Test that transform creates expected number of columns and they can be recreated by its name."""
segments = example_ts.columns.get_level_values("segment").unique()
initial_columns = example_ts.columns.get_level_values("feature").unique()
segments = example_ts.segments
initial_columns = example_ts.features
transform = FourierTransform(period=period, order=order)

transformed_df = transform.fit_transform(deepcopy(example_ts)).to_pandas()
Expand All @@ -222,7 +222,7 @@ def test_column_names(example_ts, period, order, num_columns):

def test_column_names_out_column(example_ts):
"""Test that transform creates expected columns if `out_column` is set"""
initial_columns = example_ts.columns.get_level_values("feature").unique()
initial_columns = example_ts.features
transform = FourierTransform(period=10, order=3, out_column="regressor_fourier")
transformed_df = transform.fit_transform(example_ts).to_pandas()
columns = transformed_df.columns.get_level_values("feature").unique().difference(initial_columns)
Expand Down
Loading

0 comments on commit ccc5ba4

Please sign in to comment.