
Commit 4fba73d

Merge pull request #60 from jdb78/feature/mase-metric
Add MASE metric
2 parents d1a217b + 653f9b8 commit 4fba73d


7 files changed (+184, -25 lines)


pytorch_forecasting/data/encoders.py

Lines changed: 4 additions & 1 deletion
@@ -94,7 +94,10 @@ def transform(self, y: Iterable) -> Union[torch.Tensor, np.ndarray]:
         if self.warn:
             cond = ~np.isin(y, self.classes_)
             if cond.any():
-                warnings.warn(f"Found {y[cond].nunique()} unknown classes which were set to NaN", UserWarning)
+                warnings.warn(
+                    f"Found {np.unique(np.asarray(y)[cond]).size} unknown classes which were set to NaN",
+                    UserWarning,
+                )
 
         encoded = [self.classes_.get(v, 0) for v in y]
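The fix replaces the pandas-only `nunique()` call, which breaks when `y` is a plain numpy array, with a NumPy-native count of distinct unknown labels. A minimal standalone sketch of that counting step (the `classes_` mapping below is illustrative, not the encoder's actual state):

```python
import numpy as np

# Illustrative class mapping; the real encoder stores its own fitted classes_.
classes_ = {"a": 1, "b": 2}

y = np.array(["a", "b", "x", "y", "x"])
cond = ~np.isin(y, list(classes_))               # True where the label is unknown
n_unknown = np.unique(np.asarray(y)[cond]).size  # distinct unknown labels -> 2 ("x" and "y")
print(f"Found {n_unknown} unknown classes which were set to NaN")
```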
pytorch_forecasting/metrics.py

Lines changed: 122 additions & 12 deletions
@@ -11,7 +11,7 @@
 import torch.nn.functional as F
 from torch.nn.utils import rnn
 
-from pytorch_forecasting.utils import integer_histogram
+from pytorch_forecasting.utils import integer_histogram, unpack_sequence
 
 
 class Metric(TensorMetric):
@@ -281,13 +281,7 @@ def forward(self, y_pred: Dict[str, torch.Tensor], target: Union[torch.Tensor, r
         Returns:
             torch.Tensor: loss as a single number for backpropagation
         """
-        # unpack
-        if isinstance(target, rnn.PackedSequence):
-            target, lengths = rnn.pad_packed_sequence(target, batch_first=True)
-            # batch sizes reside on the CPU by default -> we need to bring them to GPU
-            lengths = lengths.to(target.device)
-        else:
-            lengths = torch.ones(target.size(0), device=target.device, dtype=torch.long) * target.size(1)
+        target, lengths = unpack_sequence(target)
         assert not target.requires_grad
 
         # calculate loss with "none" reduction
@@ -302,24 +296,43 @@
         if weight is not None:
             losses = losses * weight.unsqueeze(-1)
 
+        loss = self.reduce_loss(losses, lengths=lengths, reduction=self.reduction)
+        return loss
+
+    def reduce_loss(self, losses: torch.Tensor, lengths: torch.Tensor, reduction: str = None) -> torch.Tensor:
+        """
+        Reduce loss.
+
+        Args:
+            losses (torch.Tensor): tensor of losses. First dimension are samples, second are timesteps.
+            lengths (torch.Tensor): tensor of lengths
+            reduction (str, optional): type of reduction. Defaults to ``self.reduction``.
+
+        Returns:
+            torch.Tensor: reduced loss
+        """
+        if reduction is None:
+            reduction = self.reduction
         # mask loss
-        mask = torch.arange(target.size(1), device=target.device).unsqueeze(0) >= lengths.unsqueeze(-1)
+        mask = torch.arange(losses.size(1), device=losses.device).unsqueeze(0) >= lengths.unsqueeze(-1)
         if losses.ndim > 2:
             mask = mask.unsqueeze(-1)
             dim_normalizer = losses.size(-1)
         else:
             dim_normalizer = 1.0
         # reduce to one number
-        if self.reduction == "none":
+        if reduction == "none":
             loss = losses.masked_fill(mask, float("nan"))
         else:
-            if self.reduction == "mean":
+            if reduction == "mean":
                 losses = losses.masked_fill(mask, 0.0)
                 loss = losses.sum() / lengths.sum() / dim_normalizer
-            elif self.reduction == "sqrt-mean":
+            elif reduction == "sqrt-mean":
                 losses = losses.masked_fill(mask, 0.0)
                 loss = losses.sum() / lengths.sum() / dim_normalizer
                 loss = loss.sqrt()
+            else:
+                raise ValueError(f"reduction {reduction} unknown")
         assert not torch.isnan(loss), (
             "Loss should not be nan - i.e. something went wrong "
             "in calculating the loss (e.g. log of a negative number)"
@@ -449,3 +462,100 @@ def __init__(self, name: str = "RMSE", reduction="sqrt-mean", *args, **kwargs):
     def loss(self, y_pred: Dict[str, torch.Tensor], target):
         loss = torch.pow(self.to_prediction(y_pred) - target, 2)
         return loss
+
+
+class MASE(MultiHorizonMetric):
+    """
+    Mean absolute scaled error
+
+    Defined as ``(y_pred - target).abs() / ((all_targets[:, :-1] - all_targets[:, 1:]).abs().mean(1))``.
+    ``all_targets`` are here the concatenated encoder and decoder targets.
+    """
+
+    def __init__(self, name: str = "MASE", *args, **kwargs):
+        super().__init__(name, *args, **kwargs)
+
+    def forward(
+        self,
+        y_pred: Dict[str, torch.Tensor],
+        target: Union[torch.Tensor, rnn.PackedSequence],
+        encoder_target: Union[torch.Tensor, rnn.PackedSequence],
+        encoder_lengths: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        Forward method of metric that handles masking of values.
+
+        Args:
+            y_pred (Dict[str, torch.Tensor]): network output
+            target (Union[torch.Tensor, rnn.PackedSequence]): actual values
+            encoder_target (Union[torch.Tensor, rnn.PackedSequence]): historic actual values
+            encoder_lengths (torch.Tensor): optional encoder lengths, not necessary if encoder_target
+                is rnn.PackedSequence. If given, encoder_target is assumed to be a torch.Tensor.
+
+        Returns:
+            torch.Tensor: loss as a single number for backpropagation
+        """
+        target, lengths = unpack_sequence(target)
+        if encoder_lengths is None:
+            encoder_target, encoder_lengths = unpack_sequence(encoder_target)
+        else:
+            assert isinstance(encoder_target, torch.Tensor)
+        assert not target.requires_grad
+
+        # calculate loss with "none" reduction
+        if target.ndim == 3:
+            weight = target[..., 1]
+            target = target[..., 0]
+        else:
+            weight = None
+
+        scaling = self.calculate_scaling(target, lengths, encoder_target, encoder_lengths)
+        losses = self.loss(y_pred, target, scaling)
+        # weight samples
+        if weight is not None:
+            losses = losses * weight.unsqueeze(-1)
+
+        loss = self.reduce_loss(losses, lengths=lengths, reduction=self.reduction)
+        return loss
+
+    def loss(self, y_pred, target, scaling):
+        return (y_pred - target).abs() / scaling.unsqueeze(-1)
+
+    def calculate_scaling(self, target, lengths, encoder_target, encoder_lengths):
+        # calculate mean(abs(diff(targets)))
+        eps = 1e-6
+        batch_size = target.size(0)
+        total_lengths = lengths + encoder_lengths
+        assert (total_lengths > 1).all(), "Need at least 2 target values to be able to calculate MASE"
+        max_length = target.size(1) + encoder_target.size(1)
+        if (total_lengths != max_length).any():  # if decoder or encoder targets have sequences of different lengths
+            targets = torch.cat(
+                [
+                    encoder_target,
+                    torch.zeros(batch_size, target.size(1), device=target.device, dtype=encoder_target.dtype),
+                ],
+                dim=1,
+            )
+            target_index = torch.arange(target.size(1), device=target.device, dtype=torch.long).unsqueeze(0).expand(
+                batch_size, -1
+            ) + encoder_lengths.unsqueeze(-1)
+            targets.scatter_(dim=1, src=target, index=target_index)
+        else:
+            targets = torch.cat([encoder_target, target], dim=1)
+
+        # take absolute difference
+        diffs = (targets[:, :-1] - targets[:, 1:]).abs()
+
+        # set last difference to 0
+        not_maximum_length = total_lengths != max_length
+        zero_correction_indices = total_lengths[not_maximum_length] - 1
+        if len(zero_correction_indices) > 0:
+            diffs[
+                torch.arange(batch_size, dtype=torch.long, device=diffs.device)[not_maximum_length],
+                zero_correction_indices,
+            ] = 0.0
+
+        # calculate mean over differences
+        scaling = diffs.sum(1) / total_lengths + eps
+
+        return scaling
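For reference, MASE scales the absolute forecast error by the average absolute one-step change of the concatenated encoder and decoder targets. A toy, hand-checkable sketch of the same arithmetic for a single fully observed series (illustration only; the committed `calculate_scaling` additionally handles variable sequence lengths via scatter and masking):

```python
import torch

# Toy MASE computation for one fully observed series (values are illustrative).
encoder_target = torch.tensor([[1.0, 2.0, 4.0]])   # history
target = torch.tensor([[5.0, 7.0]])                # decoder ground truth
y_pred = torch.tensor([[6.0, 6.0]])                # point forecasts

all_targets = torch.cat([encoder_target, target], dim=1)   # [1, 2, 4, 5, 7]
diffs = (all_targets[:, :-1] - all_targets[:, 1:]).abs()    # [1, 2, 1, 2]
scaling = diffs.sum(1) / all_targets.size(1) + 1e-6         # 6 / 5 = 1.2
losses = (y_pred - target).abs() / scaling.unsqueeze(-1)    # [0.833, 0.833]
print(scaling, losses.mean())
```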

pytorch_forecasting/models/base_model.py

Lines changed: 26 additions & 5 deletions
@@ -20,7 +20,7 @@
 
 from pytorch_forecasting.data import TimeSeriesDataSet
 from pytorch_forecasting.data.encoders import GroupNormalizer
-from pytorch_forecasting.metrics import SMAPE
+from pytorch_forecasting.metrics import MASE, SMAPE
 from pytorch_forecasting.optim import Ranger
 from pytorch_forecasting.utils import groupby_apply
 
@@ -182,23 +182,40 @@ def step(self, x: Dict[str, torch.Tensor], y: torch.Tensor, batch_idx: int, labe
             # multiply monotinicity loss by large number to ensure relevance and take to the power of 2
             # for smoothness of loss function
             monotinicity_loss = 10 * torch.pow(monotinicity_loss, 2)
+            if isinstance(self.loss, MASE):
+                loss = self.loss(
+                    prediction, y, encoder_target=x["encoder_target"], encoder_lengths=x["encoder_lengths"]
+                )
+            else:
+                loss = self.loss(prediction, y)
 
-            loss = self.loss(prediction, y) * (1 + monotinicity_loss)
+            loss = loss * (1 + monotinicity_loss)
         else:
             out = self(x)
             out["prediction"] = self.transform_output(out)
 
             # calculate loss
             prediction = out["prediction"]
-            loss = self.loss(prediction, y)
+            if isinstance(self.loss, MASE):
+                loss = self.loss(
+                    prediction, y, encoder_target=x["encoder_target"], encoder_lengths=x["encoder_lengths"]
+                )
+            else:
+                loss = self.loss(prediction, y)
 
         # log loss
         tensorboard_logs = {f"{label}_loss": loss}
         # logging losses
         y_hat_detached = prediction.detach()
         y_hat_point_detached = self.loss.to_prediction(y_hat_detached)
         for metric in self.logging_metrics:
-            tensorboard_logs[f"{label}_{metric.name}"] = metric(y_hat_point_detached, y)
+            if isinstance(metric, MASE):
+                loss_value = metric(
+                    y_hat_point_detached, y, encoder_target=x["encoder_target"], encoder_lengths=x["encoder_lengths"]
+                )
+            else:
+                loss_value = metric(y_hat_point_detached, y)
+            tensorboard_logs[f"{label}_{metric.name}"] = loss_value
         log = {f"{label}_loss": loss, "log": tensorboard_logs, "n_samples": x["decoder_lengths"].size(0)}
         if label == "train":
             log["loss"] = loss
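Since MASE needs the encoder history to compute its scaling, the training/validation step now dispatches on the loss type. A minimal sketch of the pattern — the helper name and argument layout below are illustrative, not part of the library API:

```python
from pytorch_forecasting.metrics import MASE

# Illustrative dispatch helper; mirrors the branching added in step() above.
def compute_loss(loss_fn, prediction, y, x):
    if isinstance(loss_fn, MASE):
        # MASE derives its scaling term from the encoder history
        return loss_fn(
            prediction, y, encoder_target=x["encoder_target"], encoder_lengths=x["encoder_lengths"]
        )
    return loss_fn(prediction, y)
```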
@@ -354,7 +371,11 @@ def plot_prediction(
             else:
                 loss = add_loss_to_title
                 loss.quantiles = self.loss.quantiles
-            ax.set_title(f"Loss {loss(y_hat[None], y[-n_pred:][None]):.3g}")
+            if isinstance(loss, MASE):
+                loss_value = loss(y_hat[None], y[-n_pred:][None], y[:n_pred][None])
+            else:
+                loss_value = loss(y_hat[None], y[-n_pred:][None])
+            ax.set_title(f"Loss {loss_value:.3g}")
         ax.set_xlabel("Time index")
         fig.legend()
         return fig

pytorch_forecasting/models/nbeats/__init__.py

Lines changed: 6 additions & 3 deletions
@@ -8,7 +8,7 @@
 from torch import nn
 
 from pytorch_forecasting.data import TimeSeriesDataSet
-from pytorch_forecasting.metrics import MAE, MAPE, RMSE, SMAPE
+from pytorch_forecasting.metrics import MAE, MAPE, MASE, RMSE, SMAPE
 from pytorch_forecasting.models.base_model import BaseModel
 from pytorch_forecasting.models.nbeats.sub_modules import NBEATSGenericBlock, NBEATSSeasonalBlock, NBEATSTrendBlock
 
@@ -78,7 +78,7 @@ def __init__(
             reduce_on_plateau_patience (int): patience after which learning rate is reduced by a factor of 10
         """
         self.save_hyperparameters()
-        self.logging_metrics = [SMAPE(), MAE(), RMSE(), MAPE()]
+        self.logging_metrics = [SMAPE(), MAE(), RMSE(), MAPE(), MASE()]
         super().__init__(**kwargs)
         self.loss = loss
 
@@ -218,7 +218,10 @@ def step(self, x, y, batch_idx, label) -> Dict[str, torch.Tensor]:
             )
             backcast_weight = backcast_weight / (backcast_weight + 1)  # normalize
             forecast_weight = 1 - backcast_weight
-            backcast_loss = self.loss(backcast, x["encoder_target"]) * backcast_weight
+            if isinstance(self.loss, MASE):
+                backcast_loss = self.loss(backcast, x["encoder_target"], x["decoder_target"]) * backcast_weight
+            else:
+                backcast_loss = self.loss(backcast, x["encoder_target"]) * backcast_weight
             if label == "train":
                 log["loss"] = log["loss"] * forecast_weight + backcast_loss
                 log["log"]["train_loss"] = log["log"]["train_loss"] * forecast_weight + backcast_loss
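For the backcast term the argument roles are mirrored relative to the forecast loss: the backcast is scored against the encoder targets, and the decoder targets supply the history used for MASE scaling. A small sketch with toy tensors, assuming this commit's MASE class is available (values illustrative only):

```python
import torch
from pytorch_forecasting.metrics import MASE

# Toy backcast scoring: backcast vs. encoder targets, decoder targets as scaling history.
mase = MASE()
encoder_target = torch.tensor([[1.0, 2.0, 4.0]])  # ground truth over the encoder span
decoder_target = torch.tensor([[5.0, 7.0]])       # ground truth over the decoder span
backcast = torch.tensor([[1.5, 2.5, 3.5]])        # network backcast for the encoder span

backcast_loss = mase(backcast, encoder_target, decoder_target)
print(backcast_loss)
```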

pytorch_forecasting/models/temporal_fusion_transformer/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@
 from torch.nn.utils import rnn
 
 from pytorch_forecasting.data import TimeSeriesDataSet
-from pytorch_forecasting.metrics import MAE, MAPE, RMSE, SMAPE, MultiHorizonMetric, QuantileLoss
+from pytorch_forecasting.metrics import MAE, MAPE, MASE, RMSE, SMAPE, MultiHorizonMetric, QuantileLoss
 from pytorch_forecasting.models.base_model import BaseModel, CovariatesMixin
 from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import (
     AddNorm,
@@ -126,7 +126,7 @@ def __init__(
         assert isinstance(loss, MultiHorizonMetric), "Loss has to of class `MultiHorizonMetric`"
         self.loss = loss
         self.output_transformer = output_transformer
-        self.logging_metrics = [SMAPE(), MAE(), RMSE(), MAPE()]
+        self.logging_metrics = [SMAPE(), MAE(), RMSE(), MAPE(), MASE()]
 
         # processing inputs
         # embeddings

pytorch_forecasting/models/temporal_fusion_transformer/sub_modules.py

Lines changed: 2 additions & 2 deletions
@@ -351,15 +351,15 @@ def forward(self, x: Dict[str, torch.Tensor], context: torch.Tensor = None):
                     variable_embedding = self.prescalers[name](variable_embedding)
                 weight_inputs.append(variable_embedding)
                 var_outputs.append(self.single_variable_grns[name](variable_embedding))
-            var_outputs = torch.stack(var_outputs, axis=-1)
+            var_outputs = torch.stack(var_outputs, dim=-1)
 
             # calculate variable weights
             flat_embedding = torch.cat(weight_inputs, dim=-1)
             sparse_weights = self.flattened_grn(flat_embedding, context)
             sparse_weights = self.softmax(sparse_weights).unsqueeze(-2)
 
             outputs = var_outputs * sparse_weights
-            outputs = outputs.sum(axis=-1)
+            outputs = outputs.sum(dim=-1)
         else:  # for one input, do not perform variable selection but just encoding
             name = next(iter(self.single_variable_grns.keys()))
             variable_embedding = x[name]
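The two edits here only replace the NumPy-style `axis` keyword with PyTorch's `dim`. A quick standalone check of the equivalent calls:

```python
import torch

# torch.stack and Tensor.sum take `dim`; sanity check of the replaced calls.
a, b = torch.ones(2, 3), torch.zeros(2, 3)
stacked = torch.stack([a, b], dim=-1)  # shape (2, 3, 2)
summed = stacked.sum(dim=-1)           # shape (2, 3), equals a + b
print(stacked.shape, torch.allclose(summed, a + b))
```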

pytorch_forecasting/utils.py

Lines changed: 22 additions & 0 deletions
@@ -7,6 +7,8 @@
 from typing import Callable, Tuple, Union
 
 import torch
+from torch.nn.utils import rnn
+from torch.tensor import Tensor
 
 
 def integer_histogram(
@@ -179,3 +181,23 @@ def autocorrelation(input, dim=0):
     autocorr = autocorr / torch.tensor(range(N, 0, -1), dtype=input.dtype, device=input.device)
     autocorr = autocorr / autocorr[..., :1]
     return autocorr.transpose(dim, -1)
+
+
+def unpack_sequence(sequence: Union[torch.Tensor, rnn.PackedSequence]) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Unpack RNN sequence.
+
+    Args:
+        sequence (Union[torch.Tensor, rnn.PackedSequence]): RNN packed sequence or tensor of which
+            first index are samples and second are timesteps
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: tuple of unpacked sequence and length of samples
+    """
+    if isinstance(sequence, rnn.PackedSequence):
+        sequence, lengths = rnn.pad_packed_sequence(sequence, batch_first=True)
+        # batch sizes reside on the CPU by default -> we need to bring them to GPU
+        lengths = lengths.to(sequence.device)
+    else:
+        lengths = torch.ones(sequence.size(0), device=sequence.device, dtype=torch.long) * sequence.size(1)
+    return sequence, lengths
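A short usage sketch of the new helper, assuming this commit is installed so `unpack_sequence` is importable from `pytorch_forecasting.utils`:

```python
import torch
from torch.nn.utils import rnn
from pytorch_forecasting.utils import unpack_sequence

# Pack two sequences of different lengths, then recover a padded tensor plus lengths.
padded = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]])
packed = rnn.pack_padded_sequence(padded, lengths=torch.tensor([3, 2]), batch_first=True)

sequence, lengths = unpack_sequence(packed)     # padded tensor and lengths tensor([3, 2])
plain, plain_lengths = unpack_sequence(padded)  # plain tensors get full lengths tensor([3, 3])
print(sequence.shape, lengths, plain_lengths)
```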
