Merge pull request #62 from jdb78/feature/time-synchronized-sampling

jdb78 · web-flow · commit eaa7e5390f60 · 2020-09-26T17:39:13.000+01:00
Add time-synchronized sampling to dataset
diff --git a/pytorch_forecasting/data/__init__.py b/pytorch_forecasting/data/__init__.py
@@ -5,6 +5,13 @@
 to abstracts the necessary work.
 """
 from pytorch_forecasting.data.encoders import EncoderNormalizer, GroupNormalizer, NaNLabelEncoder, TorchNormalizer
-from pytorch_forecasting.data.timeseries import TimeSeriesDataSet
+from pytorch_forecasting.data.timeseries import TimeSeriesDataSet, TimeSynchronizedBatchSampler
 
-__all__ = ["TimeSeriesDataSet", "NaNLabelEncoder", "GroupNormalizer", "TorchNormalizer", "EncoderNormalizer"]
+__all__ = [
+    "TimeSeriesDataSet",
+    "NaNLabelEncoder",
+    "GroupNormalizer",
+    "TorchNormalizer",
+    "EncoderNormalizer",
+    "TimeSynchronizedBatchSampler",
+]
diff --git a/pytorch_forecasting/data/timeseries.py b/pytorch_forecasting/data/timeseries.py
@@ -14,11 +14,13 @@
 import pandas as pd
 from sklearn.exceptions import NotFittedError
 from sklearn.preprocessing import StandardScaler
+from sklearn.utils import shuffle
 from sklearn.utils.validation import check_is_fitted
 import torch
 from torch.distributions import Beta
 from torch.nn.utils import rnn
 from torch.utils.data import DataLoader, Dataset
+from torch.utils.data.sampler import Sampler
 
 from pytorch_forecasting.data.encoders import EncoderNormalizer, GroupNormalizer, NaNLabelEncoder, TorchNormalizer
 
@@ -655,6 +657,9 @@ def _construct_index(self, data: pd.DataFrame, predict_mode: bool) -> pd.DataFra
             # prediction must be for after minimal prediction index + length of prediction
             (x["sequence_length"] + x["time"] - 1 >= self.min_prediction_idx - 1 + self.min_prediction_length)
         ]
+        # todo: add duplicates for
+        # (x.sequence length > self.min_prediction_length + self.min_encoder_length) &
+        # (x.time - x.time_start < self.max_prediction_length + self.max_encoder_length)
 
         if predict_mode:  # keep longest element per series (i.e. the first element that spans to the end of the series)
             # filter all elements that are longer than the allowed maximum sequence length
@@ -766,6 +771,7 @@ def __getitem__(self, idx: int) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
         # fill in missing values (if not all time indices are specified
         sequence_length = len(time)
         if sequence_length < index.sequence_length:
+            assert self.allow_missings, "allow_missings should be True if sequences have gaps"
             repetitions = torch.cat([time[1:] - time[:-1], torch.ones(1, dtype=time.dtype)])
             indices = torch.repeat_interleave(torch.arange(len(time)), repetitions)
             repetition_indices = torch.cat([torch.tensor([False], dtype=torch.bool), indices[1:] == indices[:-1]])
@@ -970,14 +976,21 @@ def _collate_fn(
             target,
         )
 
-    def to_dataloader(self, train: bool = True, batch_size: int = 64, **kwargs) -> DataLoader:
+    def to_dataloader(
+        self, train: bool = True, batch_size: int = 64, batch_sampler: Union[Sampler, str] = None, **kwargs
+    ) -> DataLoader:
         """
         Get dataloader from dataset.
 
         Args:
             train (bool, optional): if dataloader is used for training or prediction
                 Will shuffle and drop last batch if True. Defaults to True.
             batch_size (int): batch size for training model. Defaults to 64.
+            batch_sampler (Union[Sampler, str]): batch sampler or string. One of
+
+                * "synchronized": ensure that samples in decoder are aligned in time. Does not support missing
+                  values in dataset.
+
             **kwargs: additional arguments to ``DataLoader()``
 
 
@@ -1015,12 +1028,26 @@ def to_dataloader(self, train: bool = True, batch_size: int = 64, **kwargs) -> D
             drop_last=train and len(self) > batch_size,
             collate_fn=self._collate_fn,
             batch_size=batch_size,
+            batch_sampler=batch_sampler,
         )
-
         default_kwargs.update(kwargs)
+        kwargs = default_kwargs
+        if kwargs["batch_sampler"] is not None:
+            sampler = kwargs["batch_sampler"]
+            if isinstance(sampler, str):
+                if sampler == "synchronized":
+                    kwargs["batch_sampler"] = TimeSynchronizedBatchSampler(
+                        self, batch_size=kwargs["batch_size"], shuffle=kwargs["shuffle"], drop_last=kwargs["drop_last"]
+                    )
+                else:
+                    raise ValueError(f"batch_sampler {sampler} unknown - see docstring for valid batch_sampler")
+            del kwargs["batch_size"]
+            del kwargs["shuffle"]
+            del kwargs["drop_last"]
+
         return DataLoader(
             self,
-            **default_kwargs,
+            **kwargs,
         )
 
     def get_index(self) -> pd.DataFrame:
@@ -1045,3 +1072,106 @@ def get_index(self) -> pd.DataFrame:
             index_data[id] = self.transform_values(id, index_data[id], inverse=True)
         index = pd.DataFrame(index_data, index=self.index.index)
         return index
+
+
+class TimeSynchronizedBatchSampler(Sampler):
+    """
+    Samples mini-batches randomly but in a time-synchronised manner.
+
+    Time-synchornisation means that the time index of the first decoder samples are aligned across the batch.
+    This sampler does not support missing values in the dataset.
+    """
+
+    def __init__(
+        self,
+        data_source: TimeSeriesDataSet,
+        batch_size: int = 64,
+        shuffle: bool = False,
+        drop_last: bool = False,
+    ):
+        """
+        Initialize TimeSynchronizedBatchSampler.
+
+        Args:
+            data_source (TimeSeriesDataSet): timeseries dataset.
+            drop_last (bool): if to drop last mini-batch from a group if it is smaller than batch_size.
+                Defaults to False.
+            shuffle (bool): if to shuffle dataset. Defaults to False.
+            batch_size (int, optional): Number of samples in a mini-batch. This is rather the maximum number
+                of samples. Because mini-batches are grouped by prediction time, chances are that there
+                are multiple where batch size will be smaller than the maximum. Defaults to 64.
+        """
+        # Since collections.abc.Iterable does not check for `__getitem__`, which
+        # is one way for an object to be an iterable, we don't do an `isinstance`
+        # check here.
+        if not isinstance(batch_size, int) or isinstance(batch_size, bool) or batch_size <= 0:
+            raise ValueError(
+                "batch_size should be a positive integer value, " "but got batch_size={}".format(batch_size)
+            )
+        if not isinstance(drop_last, bool):
+            raise ValueError("drop_last should be a boolean value, but got " "drop_last={}".format(drop_last))
+        self.data_source = data_source
+        self.batch_size = batch_size
+        self.drop_last = drop_last
+        self.shuffle = shuffle
+        assert not self.data_source.allow_missings, "allow_missings should be False for time-synchronized mini-batches"
+
+        # construct index from which can be sampled
+        self.construct_batch_groups()
+
+    def construct_batch_groups(self):
+        """
+        Construct index of batches from which can be sampled
+        """
+        index = self.data_source.index
+        # get groups, i.e. group all samples by first predict time
+        decoder_lengths = np.min(
+            [
+                index.time_last - (self.data_source.min_prediction_idx - 1),
+                index.sequence_length - self.data_source.min_encoder_length,
+            ],
+            axis=0,
+        ).clip(max=self.data_source.max_prediction_length)
+        first_prediction_time = index.time + index.sequence_length - decoder_lengths + 1
+        self._groups = pd.RangeIndex(0, len(index.index)).groupby(first_prediction_time)
+
+        # calculate sizes of groups
+        self._group_sizes = {}
+        warns = []
+        for name, group in self._groups.items():  # iterate over groups
+            if self.drop_last:
+                self._group_sizes[name] = len(group) // self.batch_size
+            else:
+                self._group_sizes[name] = (len(group) + self.batch_size - 1) // self.batch_size
+            if self._group_sizes[name] == 0:
+                self._group_sizes[name] = 1
+                warns.append(name)
+        if len(warns) > 0:
+            warnings.warn(
+                f"Less than {self.batch_size} samples available for {len(warns)} prediction times. "
+                f"Use batch size smaller than {self.batch_size}. "
+                f"First 10 prediction times with small batch sizes: {warns[:10]}"
+            )
+        # create index from which can be sampled: index is equal to number of batches
+        # associate index with prediction time
+        self._group_index = np.repeat(list(self._group_sizes.keys()), list(self._group_sizes.values()))
+        # associate index with batch within prediction time group
+        self._sub_group_index = np.concatenate([np.arange(size) for size in self._group_sizes.values()])
+
+    def __iter__(self):
+        if self.shuffle:  # shuffle samples
+            groups = {name: shuffle(group) for name, group in self._groups.items()}
+        else:
+            groups = self._groups
+
+        batch_samples = np.random.permutation(len(self))
+        for idx in batch_samples:
+            name = self._group_index[idx]
+            sub_group = self._sub_group_index[idx]
+            sub_group_start = sub_group * self.batch_size
+            sub_group_end = sub_group_start + self.batch_size
+            batch = groups[name][sub_group_start:sub_group_end]
+            yield batch
+
+    def __len__(self):
+        return len(self._group_index)
diff --git a/pytorch_forecasting/models/temporal_fusion_transformer/__init__.py b/pytorch_forecasting/models/temporal_fusion_transformer/__init__.py
@@ -95,6 +95,9 @@ def __init__(
             time_varying_categoricals_decoder: integer of positions of categorical variables for decoder
             time_varying_reals_encoder: integer of positions of continuous variables for encoder
             time_varying_reals_decoder: integer of positions of continuous variables for decoder
+            categorical_groups: dictionary where values
+                are list of categorical variables that are forming together a new categorical
+                variable which is the key in the dictionary
             x_reals: order of continuous variables in tensor passed to forward function
             x_categoricals: order of categorical variables in tensor passed to forward function
             hidden_continuous_size: default for hidden size for processing continous variables (similar to categorical
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -47,7 +47,7 @@ def test_dataset(test_data):
         test_data,
         time_idx="time_idx",
         target="volume",
-        time_varying_known_reals=["price_regular"],
+        time_varying_known_reals=["price_regular", "time_idx"],
         group_ids=["agency", "sku"],
         static_categoricals=["agency"],
         max_encoder_length=5,
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -8,7 +8,13 @@
 from sklearn.preprocessing import StandardScaler
 import torch
 
-from pytorch_forecasting.data import EncoderNormalizer, GroupNormalizer, NaNLabelEncoder, TimeSeriesDataSet
+from pytorch_forecasting.data import (
+    EncoderNormalizer,
+    GroupNormalizer,
+    NaNLabelEncoder,
+    TimeSeriesDataSet,
+    TimeSynchronizedBatchSampler,
+)
 from pytorch_forecasting.data.examples import get_stallion_data
 
 torch.manual_seed(23)
@@ -263,3 +269,30 @@ def test_overwrite_values(test_dataset, value, variable, target):
         changed = torch.isclose(outputs[0][name], control_outputs[0][name]).all()
         assert changed, f"Output {name} should be reset"
     assert torch.isclose(outputs[1], control_outputs[1]).all(), "Target should be reset"
+
+
+@pytest.mark.parametrize(
+    "drop_last,shuffle,as_string,batch_size",
+    [
+        (True, True, True, 64),
+        (False, False, False, 64),
+        (True, False, False, 1000),
+    ],
+)
+def test_TimeSynchronizedBatchSampler(test_dataset, shuffle, drop_last, as_string, batch_size):
+    if as_string:
+        dataloader = test_dataset.to_dataloader(
+            batch_sampler="synchronized", shuffle=shuffle, drop_last=drop_last, batch_size=batch_size
+        )
+    else:
+        sampler = TimeSynchronizedBatchSampler(
+            data_source=test_dataset, shuffle=shuffle, drop_last=drop_last, batch_size=batch_size
+        )
+        dataloader = test_dataset.to_dataloader(batch_sampler=sampler)
+
+    time_idx_pos = test_dataset.reals.index("time_idx")
+    for x, _ in iter(dataloader):  # check all samples
+        time_idx_of_first_prediction = x["decoder_cont"][:, 0, time_idx_pos]
+        assert torch.isclose(
+            time_idx_of_first_prediction, time_idx_of_first_prediction[0]
+        ).all(), "Time index should be the same for the first prediction"