diff --git a/CHANGELOG.md b/CHANGELOG.md index fdfdb4973..c49a55815 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - - - -- +- Fix `TSDataset.train_test_split` to pass all features to train and test parts ([#545](https://github.com/etna-team/etna/pull/545)) - ## [2.10.0] - 2025-01-09 diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 3488f9ea2..b96213153 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -1148,6 +1148,9 @@ def train_test_split( In case of inconsistencies between ``test_size`` and (``test_start``, ``test_end``), ``test_size`` is ignored + During splitting all the features are kept in train and test parts including target, regressors, + target components, prediction intervals. + Parameters ---------- train_start: @@ -1210,33 +1213,47 @@ def train_test_split( if train_start_defined < self.df.index.min(): warnings.warn(f"Min timestamp in df is {self.df.index.min()}.") - train_df = self.df.loc[train_start_defined:train_end_defined][self.raw_df.columns] # type: ignore - train_raw_df = self.raw_df.loc[train_start_defined:train_end_defined] # type: ignore - train = TSDataset( - df=train_df, - df_exog=self.df_exog, - freq=self.freq, - known_future=self.known_future, - hierarchical_structure=self.hierarchical_structure, - ) - train.raw_df = train_raw_df - train._regressors = deepcopy(self.regressors) - train._target_components_names = deepcopy(self.target_components_names) - train._prediction_intervals_names = deepcopy(self._prediction_intervals_names) - - test_df = self.df.loc[test_start_defined:test_end_defined][self.raw_df.columns] # type: ignore - test_raw_df = self.raw_df.loc[train_start_defined:test_end_defined] # type: ignore - test = TSDataset( - df=test_df, - df_exog=self.df_exog, - freq=self.freq, - known_future=self.known_future, - hierarchical_structure=self.hierarchical_structure, - ) - test.raw_df = test_raw_df - test._regressors = deepcopy(self.regressors) - test._target_components_names = deepcopy(self.target_components_names) - test._prediction_intervals_names = deepcopy(self._prediction_intervals_names) + self_df = self.df + self_raw_df = self.raw_df + try: + # we do this to avoid redundant copying of data + self.df = None + self.raw_df = None + train = deepcopy(self) + + # we want to make sure it makes only one copy + train_df = self_df.loc[train_start_defined:train_end_defined] + if train_df._is_view or train_df._is_copy is not None: + train.df = train_df.copy() + else: + train.df = train_df + + # we want to make sure it makes only one copy + train_raw_df = self_raw_df.loc[train_start_defined:train_end_defined] + if train_raw_df._is_view or train_raw_df._is_copy is not None: + train.raw_df = train_raw_df.copy() + else: + train.raw_df = train_raw_df + + # we want to make sure it makes only one copy + test = deepcopy(self) + test_df = self_df.loc[test_start_defined:test_end_defined] + if test_df._is_view or test_df._is_copy is not None: + test.df = test_df.copy() + else: + test.df = test_df + + # we want to make sure it makes only one copy + test_raw_df = self_raw_df.loc[train_start_defined:test_end_defined] + if test_raw_df._is_view or test_raw_df._is_copy is not None: + test.raw_df = test_raw_df.copy() + else: + test.raw_df = test_raw_df + + finally: + self.df = self_df + self.raw_df = self_raw_df + return train, test def update_columns_from_pandas(self, df_update: pd.DataFrame): diff --git a/etna/ensembles/direct_ensemble.py b/etna/ensembles/direct_ensemble.py index de75c185e..dfae171f4 100644 --- a/etna/ensembles/direct_ensemble.py +++ b/etna/ensembles/direct_ensemble.py @@ -1,4 +1,3 @@ -from copy import deepcopy from typing import Any from typing import Dict from typing import List @@ -100,6 +99,10 @@ def _get_horizon(pipelines: List[BasePipeline]) -> int: def fit(self, ts: TSDataset, save_ts: bool = True) -> "DirectEnsemble": """Fit pipelines in ensemble. + Method doesn't change the given ``ts``. + + Saved ``ts`` is the link to given ``ts``. + Parameters ---------- ts: @@ -113,7 +116,7 @@ def fit(self, ts: TSDataset, save_ts: bool = True) -> "DirectEnsemble": Fitted ensemble """ self.pipelines = Parallel(n_jobs=self.n_jobs, **self.joblib_params)( - delayed(self._fit_pipeline)(pipeline=pipeline, ts=deepcopy(ts)) for pipeline in self.pipelines + delayed(self._fit_pipeline)(pipeline=pipeline, ts=ts) for pipeline in self.pipelines ) if save_ts: diff --git a/etna/ensembles/stacking_ensemble.py b/etna/ensembles/stacking_ensemble.py index ae48416d4..d91390355 100644 --- a/etna/ensembles/stacking_ensemble.py +++ b/etna/ensembles/stacking_ensemble.py @@ -1,5 +1,4 @@ import warnings -from copy import deepcopy from typing import Any from typing import Dict from typing import List @@ -143,6 +142,10 @@ def _backtest_pipeline(self, pipeline: BasePipeline, ts: TSDataset) -> pd.DataFr def fit(self, ts: TSDataset, save_ts: bool = True) -> "StackingEnsemble": """Fit the ensemble. + Method doesn't change the given ``ts``. + + Saved ``ts`` is the link to given ``ts``. + Parameters ---------- ts: @@ -157,7 +160,7 @@ def fit(self, ts: TSDataset, save_ts: bool = True) -> "StackingEnsemble": """ # Get forecasts from base models on backtest to fit the final model on forecasts = Parallel(n_jobs=self.n_jobs, **self.joblib_params)( - delayed(self._backtest_pipeline)(pipeline=pipeline, ts=deepcopy(ts)) for pipeline in self.pipelines + delayed(self._backtest_pipeline)(pipeline=pipeline, ts=ts) for pipeline in self.pipelines ) # Fit the final model @@ -167,7 +170,7 @@ def fit(self, ts: TSDataset, save_ts: bool = True) -> "StackingEnsemble": # Fit the base models self.pipelines = Parallel(n_jobs=self.n_jobs, **self.joblib_params)( - delayed(self._fit_pipeline)(pipeline=pipeline, ts=deepcopy(ts)) for pipeline in self.pipelines + delayed(self._fit_pipeline)(pipeline=pipeline, ts=ts) for pipeline in self.pipelines ) if save_ts: diff --git a/etna/ensembles/voting_ensemble.py b/etna/ensembles/voting_ensemble.py index 4a43632c2..a9286e699 100644 --- a/etna/ensembles/voting_ensemble.py +++ b/etna/ensembles/voting_ensemble.py @@ -1,4 +1,3 @@ -from copy import deepcopy from typing import Any from typing import Dict from typing import List @@ -132,7 +131,7 @@ def _process_weights(self, ts: TSDataset) -> List[float]: weights = [1.0 for _ in range(len(self.pipelines))] elif self.weights == "auto": forecasts = Parallel(n_jobs=self.n_jobs, **self.joblib_params)( - delayed(self._backtest_pipeline)(pipeline=pipeline, ts=deepcopy(ts)) for pipeline in self.pipelines + delayed(self._backtest_pipeline)(pipeline=pipeline, ts=ts) for pipeline in self.pipelines ) x = pd.concat( @@ -160,6 +159,10 @@ def _process_weights(self, ts: TSDataset) -> List[float]: def fit(self, ts: TSDataset, save_ts: bool = True) -> "VotingEnsemble": """Fit pipelines in ensemble. + Method doesn't change the given ``ts``. + + Saved ``ts`` is the link to given ``ts``. + Parameters ---------- ts: @@ -173,7 +176,7 @@ def fit(self, ts: TSDataset, save_ts: bool = True) -> "VotingEnsemble": Fitted ensemble """ self.pipelines = Parallel(n_jobs=self.n_jobs, **self.joblib_params)( - delayed(self._fit_pipeline)(pipeline=pipeline, ts=deepcopy(ts)) for pipeline in self.pipelines + delayed(self._fit_pipeline)(pipeline=pipeline, ts=ts) for pipeline in self.pipelines ) self.processed_weights = self._process_weights(ts=ts) diff --git a/etna/pipeline/autoregressive_pipeline.py b/etna/pipeline/autoregressive_pipeline.py index 1a708d29a..625daa135 100644 --- a/etna/pipeline/autoregressive_pipeline.py +++ b/etna/pipeline/autoregressive_pipeline.py @@ -1,4 +1,5 @@ import warnings +from copy import deepcopy from typing import Sequence from typing import cast @@ -94,6 +95,10 @@ def fit(self, ts: TSDataset, save_ts: bool = True) -> "AutoRegressivePipeline": Fit and apply given transforms to the data, then fit the model on the transformed data. + Method doesn't change the given ``ts``. + + Saved ``ts`` is the link to given ``ts``. + Parameters ---------- ts: @@ -106,9 +111,9 @@ def fit(self, ts: TSDataset, save_ts: bool = True) -> "AutoRegressivePipeline": : Fitted Pipeline instance """ - ts.fit_transform(self.transforms) - self.model.fit(ts) - ts.inverse_transform(self.transforms) + cur_ts = deepcopy(ts) + cur_ts.fit_transform(self.transforms) + self.model.fit(cur_ts) if save_ts: self.ts = ts diff --git a/etna/pipeline/hierarchical_pipeline.py b/etna/pipeline/hierarchical_pipeline.py index abca3eb76..dfdcee291 100644 --- a/etna/pipeline/hierarchical_pipeline.py +++ b/etna/pipeline/hierarchical_pipeline.py @@ -51,6 +51,10 @@ def fit(self, ts: TSDataset, save_ts: bool = True) -> "HierarchicalPipeline": Fit and apply given transforms to the data, then fit the model on the transformed data. Provided hierarchical dataset will be aggregated to the source level before fitting pipeline. + Method doesn't change the given ``ts``. + + Saved ``ts`` is the link to given ``ts``. + Parameters ---------- ts: diff --git a/etna/pipeline/pipeline.py b/etna/pipeline/pipeline.py index 8edbba14d..d69f4f7c4 100644 --- a/etna/pipeline/pipeline.py +++ b/etna/pipeline/pipeline.py @@ -1,3 +1,4 @@ +from copy import deepcopy from typing import Optional from typing import Sequence from typing import cast @@ -54,6 +55,10 @@ def fit(self, ts: TSDataset, save_ts: bool = True) -> "Pipeline": Fit and apply given transforms to the data, then fit the model on the transformed data. + Method doesn't change the given ``ts``. + + Saved ``ts`` is the link to given ``ts``. + Parameters ---------- ts: @@ -66,9 +71,9 @@ def fit(self, ts: TSDataset, save_ts: bool = True) -> "Pipeline": : Fitted Pipeline instance """ - ts.fit_transform(self.transforms) - self.model.fit(ts) - ts.inverse_transform(self.transforms) + cur_ts = deepcopy(ts) + cur_ts.fit_transform(self.transforms) + self.model.fit(cur_ts) if save_ts: self.ts = ts diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 5a43a4e21..42e3461e1 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -17,6 +17,7 @@ from etna.datasets.utils import make_timestamp_df_from_alignment from etna.transforms import AddConstTransform from etna.transforms import DifferencingTransform +from etna.transforms import LagTransform from etna.transforms import TimeSeriesImputerTransform @@ -956,20 +957,45 @@ def test_train_test_split_pass_regressors_to_output(df_and_regressors): df, df_exog, known_future = df_and_regressors ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) train, test = ts.train_test_split(test_size=5) + assert set(train.regressors).issubset(set(train.features)) + assert set(test.regressors).issubset(set(test.features)) + assert train.regressors == ts.regressors + assert test.regressors == ts.regressors + + +def test_train_test_split_pass_transform_regressors_to_output(df_and_regressors): + df, df_exog, known_future = df_and_regressors + ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) + ts.fit_transform(transforms=[LagTransform(in_column="target", lags=[1, 2, 3])]) + train, test = ts.train_test_split(test_size=5) + assert set(train.regressors).issubset(set(train.features)) + assert set(test.regressors).issubset(set(test.features)) assert train.regressors == ts.regressors assert test.regressors == ts.regressors def test_train_test_split_pass_target_components_to_output(ts_with_target_components): train, test = ts_with_target_components.train_test_split(test_size=5) + train_target_components = train.get_target_components() + test_target_components = test.get_target_components() + assert set(train.target_components_names).issubset(set(train.features)) + assert set(test.target_components_names).issubset(set(test.features)) assert sorted(train.target_components_names) == sorted(ts_with_target_components.target_components_names) assert sorted(test.target_components_names) == sorted(ts_with_target_components.target_components_names) + assert set(train_target_components.columns.get_level_values("feature")) == set(train.target_components_names) + assert set(test_target_components.columns.get_level_values("feature")) == set(test.target_components_names) def test_train_test_split_pass_prediction_intervals_to_output(ts_with_prediction_intervals): train, test = ts_with_prediction_intervals.train_test_split(test_size=5) + train_prediction_intervals = train.get_prediction_intervals() + test_prediction_intervals = test.get_prediction_intervals() + assert set(train.prediction_intervals_names).issubset(set(train.features)) + assert set(test.prediction_intervals_names).issubset(set(test.features)) assert sorted(train.prediction_intervals_names) == sorted(ts_with_prediction_intervals.prediction_intervals_names) assert sorted(test.prediction_intervals_names) == sorted(ts_with_prediction_intervals.prediction_intervals_names) + assert set(train_prediction_intervals.columns.get_level_values("feature")) == set(train.prediction_intervals_names) + assert set(test_prediction_intervals.columns.get_level_values("feature")) == set(test.prediction_intervals_names) def test_to_dataset_datetime_conversion(): diff --git a/tests/test_datasets/test_hierarchical_dataset.py b/tests/test_datasets/test_hierarchical_dataset.py index 99f73317d..f7ba433ca 100644 --- a/tests/test_datasets/test_hierarchical_dataset.py +++ b/tests/test_datasets/test_hierarchical_dataset.py @@ -508,3 +508,13 @@ def test_get_level_dataset_with_target_components( reconciled_ts = product_level_constant_forecast_with_target_components.get_level_dataset(target_level=target_level) pd.testing.assert_frame_equal(reconciled_ts.get_target_components(), expected_ts.get_target_components()) pd.testing.assert_frame_equal(reconciled_ts.to_pandas(), expected_ts.to_pandas()) + + +def test_train_test_split_pass_hierarchy_to_output(simple_hierarchical_ts): + train, test = simple_hierarchical_ts.train_test_split(test_size=1) + assert train.hierarchical_structure is not None + assert test.hierarchical_structure is not None + assert train.hierarchical_structure.level_structure == simple_hierarchical_ts.hierarchical_structure.level_structure + assert test.hierarchical_structure.level_structure == simple_hierarchical_ts.hierarchical_structure.level_structure + assert train.hierarchical_structure.level_names == simple_hierarchical_ts.hierarchical_structure.level_names + assert test.hierarchical_structure.level_names == simple_hierarchical_ts.hierarchical_structure.level_names diff --git a/tests/test_pipeline/test_pipeline.py b/tests/test_pipeline/test_pipeline.py index b8dc0e917..b1980c32e 100644 --- a/tests/test_pipeline/test_pipeline.py +++ b/tests/test_pipeline/test_pipeline.py @@ -93,9 +93,7 @@ def test_fit(example_tsds, save_ts): transforms = [AddConstTransform(in_column="target", value=10, inplace=True), DateFlagsTransform()] pipeline = Pipeline(model=model, transforms=transforms, horizon=5) pipeline.fit(example_tsds, save_ts=save_ts) - original_ts.fit_transform(transforms) - original_ts.inverse_transform(transforms) - assert np.all(original_ts.df.values == example_tsds.df.values) + pd.testing.assert_frame_equal(original_ts.df, example_tsds.df) @pytest.mark.parametrize("save_ts", [False, True])