Skip to content

Commit

Permalink
Fix target leakage in MeanSegmentEncoderTransform (#503)
Browse files Browse the repository at this point in the history
* fix MeanEncoder

* update changelog

* fix segment encoder tests

* make attributes private

---------

Co-authored-by: Egor Baturin <[email protected]>
  • Loading branch information
egoriyaa and Egor Baturin authored Nov 9, 2024
1 parent 4a6e975 commit e65fa1b
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 136 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples ([#494](https://github.com/etna-team/etna/pull/494))
- Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying during making samples ([#499](https://github.com/etna-team/etna/pull/499))
- Fix working with NaN target in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492))
-
- Fix `target` leakage in `MeanSegmentEncoderTransform` ([#503](https://github.com/etna-team/etna/pull/503))
-
-
-
Expand Down
54 changes: 24 additions & 30 deletions etna/transforms/encoders/mean_segment_encoder.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,34 @@
import reprlib
from typing import Dict
from typing import List
from typing import Optional

import numpy as np
import pandas as pd

from etna.transforms import IrreversibleTransform
from etna.transforms.math.statistics import MeanTransform
from etna.transforms.encoders.mean_encoder import MeanEncoderTransform


class MeanSegmentEncoderTransform(IrreversibleTransform):
"""Makes expanding mean target encoding of the segment. Creates column 'segment_mean'."""

idx = pd.IndexSlice
_segment_column = "segment_column"
out_column = "segment_mean"

def __init__(self):
super().__init__(required_features=["target"])
self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="segment_mean")
self.global_means: Optional[Dict[str, float]] = None
self._mean_encoder = MeanEncoderTransform(
in_column=self._segment_column, mode="per-segment", out_column=self.out_column, smoothing=0
)

def _add_segment_column(self, df):
segments = df.columns.get_level_values("segment").unique()
flatten_segments = np.repeat(segments.values[np.newaxis, :], len(df), axis=0)
segment_values = pd.DataFrame(
data=flatten_segments,
columns=pd.MultiIndex.from_product([segments, [self._segment_column]]),
index=df.index,
)
df = pd.concat([df, segment_values], axis=1).sort_index(axis=1)
return df

def _fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
"""
Expand All @@ -34,10 +44,8 @@ def _fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
:
Fitted transform
"""
self.mean_encoder._fit(df)
mean_values = df.loc[:, self.idx[:, "target"]].mean().to_dict()
mean_values = {key[0]: value for key, value in mean_values.items()}
self.global_means = mean_values
df = self._add_segment_column(df)
self._mean_encoder._fit(df)
return self

def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -61,25 +69,11 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
NotImplementedError:
If there are segments that weren't present during training.
"""
if self.global_means is None:
raise ValueError("The transform isn't fitted!")

segments = df.columns.get_level_values("segment").unique().tolist()
new_segments = set(segments) - self.global_means.keys()
if len(new_segments) > 0:
raise NotImplementedError(
f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}"
)

df = self.mean_encoder._transform(df)
segment = segments[0]
nan_timestamps = df[df.loc[:, self.idx[segment, "target"]].isna()].index
values_to_set = np.array([self.global_means[x] for x in segments])
# repetition isn't necessary for pandas >= 1.2
values_to_set = np.repeat(values_to_set[np.newaxis, :], len(nan_timestamps), axis=0)
df.loc[nan_timestamps, self.idx[:, "segment_mean"]] = values_to_set
return df
df = self._add_segment_column(df)
df_transformed = self._mean_encoder._transform(df)
df_transformed = df_transformed.drop(columns=[self._segment_column], level="feature")
return df_transformed

def get_regressors_info(self) -> List[str]:
"""Return the list with regressors created by the transform."""
return ["segment_mean"]
return [self.out_column]
50 changes: 23 additions & 27 deletions tests/test_transforms/test_encoders/conftest.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,34 @@
import numpy as np
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.datasets import generate_ar_df


@pytest.fixture
def simple_ts() -> TSDataset:
df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
df_1["segment"] = "Moscow"
df_1["target"] = [1.0, 2.0, 3.0, 4.0, 5.0, np.NAN, np.NAN]
df_1["exog"] = [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
df_2["segment"] = "Omsk"
df_2["target"] = [10.0, 20.0, 30.0, 40.0, 50.0, np.NAN, np.NAN]
df_2["exog"] = [60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0]
classic_df = pd.concat([df_1, df_2], ignore_index=True)
df = TSDataset.to_dataset(classic_df)
ts = TSDataset(df, freq="D")
def mean_segment_encoder_ts() -> TSDataset:
df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=5)
df["target"] = [0.0, 1.0, np.NaN, 3.0, 4.0] + [np.NaN, 1.0, 2.0, 3.0, 4.0]

ts = TSDataset(df=df, freq="D")
return ts


@pytest.fixture
def expected_mean_segment_encoder_ts() -> TSDataset:
    """Build the two-segment dataset with the expected ``segment_mean`` column.

    The ``target`` values match the ``mean_segment_encoder_ts`` fixture.
    """
    df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=5)
    # ``np.nan`` is used instead of the ``np.NaN`` alias, which NumPy 2.0 removed.
    df["target"] = [0.0, 1.0, np.nan, 3.0, 4.0] + [np.nan, 1.0, 2.0, 3.0, 4.0]
    # 1.33 is a rounded 4/3; the tests compare with an absolute tolerance of 0.01.
    df["segment_mean"] = [np.nan, 0, 0.5, 0.5, 1.33] + [np.nan, np.nan, 1, 1.5, 2.0]

    ts = TSDataset(df=df, freq="D")
    return ts


@pytest.fixture
def transformed_simple_df() -> pd.DataFrame:
df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
df_1["segment"] = "Moscow"
df_1["target"] = [1.0, 2.0, 3.0, 4.0, 5.0, np.NAN, np.NAN]
df_1["exog"] = [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
df_1["segment_mean"] = [1, 1.5, 2, 2.5, 3, 3, 3]
df_2["segment"] = "Omsk"
df_2["target"] = [10.0, 20.0, 30.0, 40.0, 50.0, np.NAN, np.NAN]
df_2["exog"] = [60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0]
df_2["segment_mean"] = [10.0, 15.0, 20.0, 25.0, 30, 30, 30]
classic_df = pd.concat([df_1, df_2], ignore_index=True)
df = TSDataset.to_dataset(classic_df)
return df
def expected_make_future_mean_segment_encoder_ts() -> TSDataset:
df = generate_ar_df(start_time="2001-01-06", periods=2, n_segments=2)
df["target"] = [np.NaN, np.NaN] + [np.NaN, np.NaN]
df["segment_mean"] = [2.0, 2.0] + [2.5, 2.5]

ts = TSDataset(df=df, freq="D")
return ts
31 changes: 8 additions & 23 deletions tests/test_transforms/test_encoders/test_mean_encoder_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def expected_micro_category_ts() -> TSDataset:
df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True)
df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 1.5, 2.75, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, np.NaN]

ts = TSDataset(df, freq="D")
return ts

Expand Down Expand Up @@ -151,28 +150,14 @@ def expected_multiple_nan_target_category_ts() -> TSDataset:


@pytest.fixture
def mean_segment_encoder_ts() -> TSDataset:
df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
df["target"] = [0, 1, np.NaN, 3, 4]

df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=10)
df_exog.rename(columns={"target": "segment_feature"}, inplace=True)
df_exog["segment_feature"] = "segment_0"

ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all")

return ts


@pytest.fixture
def expected_mean_segment_encoder_ts() -> TSDataset:
df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
df.rename(columns={"target": "segment_mean"}, inplace=True)
df["segment_mean"] = [np.NaN, 0, 0.5, 0.5, 1.33]
def mean_segment_encoder_ts(mean_segment_encoder_ts) -> TSDataset:
df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=7)
df = df.drop(columns=["target"])
df["segment_feature"] = ["segment_0"] * 7 + ["segment_1"] * 7
df_wide = TSDataset.to_dataset(df)
mean_segment_encoder_ts.add_columns_from_pandas(df_wide, update_exog=True, regressors=["segment_feature"])

ts = TSDataset(df=df, freq="D")

return ts
return mean_segment_encoder_ts


@pytest.fixture
Expand Down Expand Up @@ -407,7 +392,7 @@ def test_mean_segment_encoder(mean_segment_encoder_ts, expected_mean_segment_enc
mean_encoder.fit_transform(mean_segment_encoder_ts)
assert_frame_equal(
mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]],
expected_mean_segment_encoder_ts.df,
expected_mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]],
atol=0.01,
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from etna.datasets import TSDataset
from etna.metrics import R2
Expand All @@ -10,44 +11,31 @@
from tests.utils import select_segments_subset


@pytest.mark.parametrize("expected_global_means", [{"Moscow": 3, "Omsk": 30}])
def test_mean_segment_encoder_fit(simple_ts, expected_global_means):
def test_mean_segment_encoder_transform(mean_segment_encoder_ts, expected_mean_segment_encoder_ts):
encoder = MeanSegmentEncoderTransform()
encoder.fit(simple_ts)
assert encoder.global_means == expected_global_means
transformed_df = encoder.fit_transform(mean_segment_encoder_ts).to_pandas()
assert_frame_equal(transformed_df, expected_mean_segment_encoder_ts.to_pandas(), atol=0.01)


def test_mean_segment_encoder_transform(simple_ts, transformed_simple_df):
encoder = MeanSegmentEncoderTransform()
transformed_df = encoder.fit_transform(simple_ts).to_pandas()
transformed_simple_df.index.freq = "D"
pd.testing.assert_frame_equal(transformed_simple_df, transformed_df)


def test_subset_segments(simple_ts):
train_ts = simple_ts
test_df = simple_ts.loc[:, pd.IndexSlice["Omsk", :]]
test_ts = TSDataset(df=test_df, freq=simple_ts.freq)
transform = MeanSegmentEncoderTransform()

transform.fit(train_ts)
transformed_test_df = transform.transform(test_ts).to_pandas()
def test_make_future_mean_segment_encoder_transform(
mean_segment_encoder_ts, expected_make_future_mean_segment_encoder_ts
):
mean_segment_encoder = MeanSegmentEncoderTransform()
mean_segment_encoder.fit_transform(mean_segment_encoder_ts)
future_ts = mean_segment_encoder_ts.make_future(future_steps=2, transforms=[mean_segment_encoder])

segments = sorted(transformed_test_df.columns.get_level_values("segment").unique())
features = sorted(transformed_test_df.columns.get_level_values("feature").unique())
assert segments == ["Omsk"]
assert features == ["exog", "segment_mean", "target"]
assert_frame_equal(future_ts.to_pandas(), expected_make_future_mean_segment_encoder_ts.to_pandas())


def test_not_fitted_error(simple_ts):
def test_not_fitted_error(mean_segment_encoder_ts):
encoder = MeanSegmentEncoderTransform()
with pytest.raises(ValueError, match="The transform isn't fitted"):
encoder.transform(simple_ts)
encoder.transform(mean_segment_encoder_ts)


def test_new_segments_error(simple_ts):
train_ts = select_segments_subset(ts=simple_ts, segments=["Moscow"])
test_ts = select_segments_subset(ts=simple_ts, segments=["Omsk"])
def test_new_segments_error(mean_segment_encoder_ts):
train_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_0"])
test_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_1"])
transform = MeanSegmentEncoderTransform()

transform.fit(train_ts)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,54 +2,36 @@
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.transforms import SegmentEncoderTransform
from tests.test_transforms.utils import assert_transformation_equals_loaded_original
from tests.utils import select_segments_subset


def test_segment_encoder_transform(simple_ts):
def test_segment_encoder_transform(mean_segment_encoder_ts):
transform = SegmentEncoderTransform()
transformed_df = transform.fit_transform(simple_ts).to_pandas()
transformed_df = transform.fit_transform(mean_segment_encoder_ts).to_pandas()
assert (
len(transformed_df.loc[:, pd.IndexSlice[:, "segment_code"]].columns) == 2
), "Number of columns not the same as segments"
assert len(simple_ts.to_pandas()) == len(transformed_df), "Row missing"
assert len(mean_segment_encoder_ts.to_pandas()) == len(transformed_df), "Row missing"
codes = set()
for segment in simple_ts.segments:
for segment in mean_segment_encoder_ts.segments:
column = transformed_df.loc[:, pd.IndexSlice[segment, "segment_code"]]
assert column.dtype == "category", "Column type is not category"
assert np.all(column == column.iloc[0]), "Values are not the same for the whole column"
codes.add(column.iloc[0])
assert codes == {0, 1}, "Codes are not 0 and 1"


def test_subset_segments(simple_ts):
train_ts = simple_ts
test_df = simple_ts.loc[:, pd.IndexSlice["Omsk", :]]
test_ts = TSDataset(df=test_df, freq=simple_ts.freq)
transform = SegmentEncoderTransform()

transform.fit(train_ts)
transformed_test_df = transform.transform(test_ts).to_pandas()

segments = sorted(transformed_test_df.columns.get_level_values("segment").unique())
features = sorted(transformed_test_df.columns.get_level_values("feature").unique())
assert segments == ["Omsk"]
assert features == ["exog", "segment_code", "target"]
values = transformed_test_df.loc[:, pd.IndexSlice[:, "segment_code"]]
assert np.all(values == values.iloc[0])


def test_not_fitted_error(simple_ts):
def test_not_fitted_error(mean_segment_encoder_ts):
encoder = SegmentEncoderTransform()
with pytest.raises(ValueError, match="The transform isn't fitted"):
encoder.transform(simple_ts)
encoder.transform(mean_segment_encoder_ts)


def test_new_segments_error(simple_ts):
train_ts = select_segments_subset(ts=simple_ts, segments=["Moscow"])
test_ts = select_segments_subset(ts=simple_ts, segments=["Omsk"])
def test_new_segments_error(mean_segment_encoder_ts):
train_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_0"])
test_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_1"])
transform = SegmentEncoderTransform()

transform.fit(train_ts)
Expand Down

0 comments on commit e65fa1b

Please sign in to comment.