[ENH] refactor test data scenario generation to tests._data_scenarios (#1877)

fkiraly · web-flow · commit edd718d617b5 · 2025-06-12T00:41:51.000+02:00
This PR moves all test data scenario generation to
`pytorch_forecasting.tests._data_scenarios`.

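It also deduplicates the existing data scenario code: the pytest fixtures in
`pytorch_forecasting/tests/_conftest.py` and the model metadata classes now
import and call the plain functions in `_data_scenarios` instead of keeping
their own copies.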
diff --git a/pytorch_forecasting/models/mlp/_decodermlp_metadata.py b/pytorch_forecasting/models/mlp/_decodermlp_metadata.py
@@ -85,12 +85,12 @@ def _get_test_dataloaders_from(cls, params):
         """
         data_loader_kwargs = params.get("data_loader_kwargs", {})
 
-        from pytorch_forecasting.tests._conftest import (
-            _data_with_covariates,
+        from pytorch_forecasting.tests._data_scenarios import (
+            data_with_covariates,
             make_dataloaders,
         )
 
-        dwc = _data_with_covariates()
+        dwc = data_with_covariates()
         dwc.assign(target=lambda x: x.volume)
         dl_default_kwargs = dict(
             target="target",
diff --git a/pytorch_forecasting/models/nbeats/_nbeats_metadata.py b/pytorch_forecasting/models/nbeats/_nbeats_metadata.py
@@ -54,8 +54,8 @@ def _get_test_dataloaders_from(cls, params):
             Dict of dataloaders created from the parameters.
             Train, validation, and test dataloaders, in this order.
         """
-        from pytorch_forecasting.tests._conftest import (
-            _dataloaders_fixed_window_without_covariates,
+        from pytorch_forecasting.tests._data_scenarios import (
+            dataloaders_fixed_window_without_covariates,
         )
 
-        return _dataloaders_fixed_window_without_covariates()
+        return dataloaders_fixed_window_without_covariates()
diff --git a/pytorch_forecasting/tests/_conftest.py b/pytorch_forecasting/tests/_conftest.py
@@ -19,47 +19,11 @@ def gpus():
 
 @pytest.fixture(scope="session")
 def data_with_covariates():
-    return _data_with_covariates()
-
-
-def _data_with_covariates():
-    data = get_stallion_data()
-    data["month"] = data.date.dt.month.astype(str)
-    data["log_volume"] = np.log1p(data.volume)
-    data["weight"] = 1 + np.sqrt(data.volume)
-
-    data["time_idx"] = data["date"].dt.year * 12 + data["date"].dt.month
-    data["time_idx"] -= data["time_idx"].min()
-
-    # convert special days into strings
-    special_days = [
-        "easter_day",
-        "good_friday",
-        "new_year",
-        "christmas",
-        "labor_day",
-        "independence_day",
-        "revolution_day_memorial",
-        "regional_games",
-        "fifa_u_17_world_cup",
-        "football_gold_cup",
-        "beer_capital",
-        "music_fest",
-    ]
-    data[special_days] = (
-        data[special_days].apply(lambda x: x.map({0: "", 1: x.name})).astype("category")
+    from pytorch_forecasting.tests._data_scenarios import (
+        data_with_covariates as _data_with_covariates,
     )
-    data = data.astype(dict(industry_volume=float))
-
-    # select data subset
-    data = data[lambda x: x.sku.isin(data.sku.unique()[:2])][
-        lambda x: x.agency.isin(data.agency.unique()[:2])
-    ]
 
-    # default target
-    data["target"] = data["volume"].clip(1e-3, 1.0)
-
-    return data
+    return _data_with_covariates()
 
 
 def make_dataloaders(data_with_covariates, **kwargs):
@@ -161,47 +125,12 @@ def multiple_dataloaders_with_covariates(data_with_covariates, request):
 
 @pytest.fixture(scope="session")
 def dataloaders_with_different_encoder_decoder_length(data_with_covariates):
-    return make_dataloaders(
-        data_with_covariates.copy(),
-        target="target",
-        time_varying_known_categoricals=["special_days", "month"],
-        variable_groups=dict(
-            special_days=[
-                "easter_day",
-                "good_friday",
-                "new_year",
-                "christmas",
-                "labor_day",
-                "independence_day",
-                "revolution_day_memorial",
-                "regional_games",
-                "fifa_u_17_world_cup",
-                "football_gold_cup",
-                "beer_capital",
-                "music_fest",
-            ]
-        ),
-        time_varying_known_reals=[
-            "time_idx",
-            "price_regular",
-            "price_actual",
-            "discount",
-            "discount_in_percent",
-        ],
-        time_varying_unknown_categoricals=[],
-        time_varying_unknown_reals=[
-            "target",
-            "volume",
-            "log_volume",
-            "industry_volume",
-            "soda_volume",
-            "avg_max_temp",
-        ],
-        static_categoricals=["agency"],
-        add_relative_time_idx=False,
-        target_normalizer=GroupNormalizer(groups=["agency", "sku"], center=False),
+    from pytorch_forecasting.tests._data_scenarios import (
+        dataloaders_with_different_encoder_decoder_length as _dataloader,
     )
 
+    return _dataloader()
+
 
 @pytest.fixture(scope="session")
 def dataloaders_with_covariates(data_with_covariates):
@@ -228,43 +157,8 @@ def dataloaders_multi_target(data_with_covariates):
 
 @pytest.fixture(scope="session")
 def dataloaders_fixed_window_without_covariates():
-    return _dataloaders_fixed_window_without_covariates()
-
-
-def _dataloaders_fixed_window_without_covariates():
-    data = generate_ar_data(seasonality=10.0, timesteps=50, n_series=2)
-    validation = data.series.iloc[:2]
-
-    max_encoder_length = 30
-    max_prediction_length = 10
-
-    training = TimeSeriesDataSet(
-        data[lambda x: ~x.series.isin(validation)],
-        time_idx="time_idx",
-        target="value",
-        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
-        group_ids=["series"],
-        static_categoricals=[],
-        max_encoder_length=max_encoder_length,
-        max_prediction_length=max_prediction_length,
-        time_varying_unknown_reals=["value"],
-        target_normalizer=EncoderNormalizer(),
+    from pytorch_forecasting.tests._data_scenarios import (
+        dataloaders_fixed_window_without_covariates as _dataloader,
     )
 
-    validation = TimeSeriesDataSet.from_dataset(
-        training,
-        data[lambda x: x.series.isin(validation)],
-        stop_randomization=True,
-    )
-    batch_size = 2
-    train_dataloader = training.to_dataloader(
-        train=True, batch_size=batch_size, num_workers=0
-    )
-    val_dataloader = validation.to_dataloader(
-        train=False, batch_size=batch_size, num_workers=0
-    )
-    test_dataloader = validation.to_dataloader(
-        train=False, batch_size=batch_size, num_workers=0
-    )
-
-    return dict(train=train_dataloader, val=val_dataloader, test=test_dataloader)
+    return _dataloader()
diff --git a/pytorch_forecasting/tests/_data_scenarios.py b/pytorch_forecasting/tests/_data_scenarios.py
@@ -1,5 +1,4 @@
 import numpy as np
-import pytest
 import torch
 
 from pytorch_forecasting import TimeSeriesDataSet
@@ -9,14 +8,6 @@
 torch.manual_seed(23)
 
 
-@pytest.fixture(scope="session")
-def gpus():
-    if torch.cuda.is_available():
-        return [0]
-    else:
-        return 0
-
-
 def data_with_covariates():
     data = get_stallion_data()
     data["month"] = data.date.dt.month.astype(str)
@@ -87,77 +78,9 @@ def make_dataloaders(data_with_covariates, **kwargs):
     return dict(train=train_dataloader, val=val_dataloader, test=test_dataloader)
 
 
-@pytest.fixture(
-    params=[
-        dict(),
-        dict(
-            static_categoricals=["agency", "sku"],
-            static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
-            time_varying_known_categoricals=["special_days", "month"],
-            variable_groups=dict(
-                special_days=[
-                    "easter_day",
-                    "good_friday",
-                    "new_year",
-                    "christmas",
-                    "labor_day",
-                    "independence_day",
-                    "revolution_day_memorial",
-                    "regional_games",
-                    "fifa_u_17_world_cup",
-                    "football_gold_cup",
-                    "beer_capital",
-                    "music_fest",
-                ]
-            ),
-            time_varying_known_reals=[
-                "time_idx",
-                "price_regular",
-                "price_actual",
-                "discount",
-                "discount_in_percent",
-            ],
-            time_varying_unknown_categoricals=[],
-            time_varying_unknown_reals=[
-                "volume",
-                "log_volume",
-                "industry_volume",
-                "soda_volume",
-                "avg_max_temp",
-            ],
-            constant_fill_strategy={"volume": 0},
-            categorical_encoders={"sku": NaNLabelEncoder(add_nan=True)},
-        ),
-        dict(static_categoricals=["agency", "sku"]),
-        dict(randomize_length=True, min_encoder_length=2),
-        dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
-        dict(target_normalizer=GroupNormalizer(transformation="log1p")),
-        dict(
-            target_normalizer=GroupNormalizer(
-                groups=["agency", "sku"], transformation="softplus", center=False
-            )
-        ),
-        dict(target="agency"),
-        # test multiple targets
-        dict(target=["industry_volume", "volume"]),
-        dict(target=["agency", "volume"]),
-        dict(
-            target=["agency", "volume"], min_encoder_length=1, min_prediction_length=1
-        ),
-        dict(target=["agency", "volume"], weight="volume"),
-        # test weights
-        dict(target="volume", weight="volume"),
-    ],
-    scope="session",
-)
-def multiple_dataloaders_with_covariates(data_with_covariates, request):
-    return make_dataloaders(data_with_covariates, **request.param)
-
-
-@pytest.fixture(scope="session")
-def dataloaders_with_different_encoder_decoder_length(data_with_covariates):
+def dataloaders_with_different_encoder_decoder_length():
     return make_dataloaders(
-        data_with_covariates.copy(),
+        data_with_covariates(),
         target="target",
         time_varying_known_categoricals=["special_days", "month"],
         variable_groups=dict(
@@ -198,10 +121,9 @@ def dataloaders_with_different_encoder_decoder_length(data_with_covariates):
     )
 
 
-@pytest.fixture(scope="session")
-def dataloaders_with_covariates(data_with_covariates):
+def dataloaders_with_covariates():
     return make_dataloaders(
-        data_with_covariates.copy(),
+        data_with_covariates(),
         target="target",
         time_varying_known_reals=["discount"],
         time_varying_unknown_reals=["target"],
@@ -211,17 +133,15 @@ def dataloaders_with_covariates(data_with_covariates):
     )
 
 
-@pytest.fixture(scope="session")
-def dataloaders_multi_target(data_with_covariates):
+def dataloaders_multi_target():
     return make_dataloaders(
-        data_with_covariates.copy(),
+        data_with_covariates(),
         time_varying_unknown_reals=["target", "discount"],
         target=["target", "discount"],
         add_relative_time_idx=False,
     )
 
 
-@pytest.fixture(scope="session")
 def dataloaders_fixed_window_without_covariates():
     data = generate_ar_data(seasonality=10.0, timesteps=50, n_series=2)
     validation = data.series.iloc[:2]
diff --git a/tests/test_models/conftest.py b/tests/test_models/conftest.py