@@ -841,10 +841,9 @@ class BetaDistributionLoss(DistributionLoss):
     Beta distribution loss for unit interval data.

     Requirements for original target normalizer:
-        * coerced to be positive
-        * not centered normalization (only rescaled)
-        * normalized target not in log space
+        * logit transformation
     """
+
     distribution_class = distributions.Beta
     distribution_arguments = ["mean", "shape"]

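Not part of the diff, purely an illustration: the new docstring requires the target normalizer to apply a logit transformation, and the asserts in the next hunk additionally require centered scaling. A minimal plain-torch sketch of what that combination means for a unit-interval target (this is not the library's normalizer implementation):

import torch

# Illustrative sketch only: "logit transformation + centered scaling"
# for a target y constrained to (0, 1).
y = torch.tensor([0.10, 0.40, 0.70, 0.90])
y_logit = torch.log(y / (1 - y))               # map (0, 1) onto the real line
center, scale = y_logit.mean(), y_logit.std()  # standard scaling in logit space
y_scaled = (y_logit - center) / scale          # what the network is trained against
# target_scale then carries (center, scale), so rescale_parameters below can map
# raw network outputs back to a valid Beta mean in (0, 1)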
@@ -854,17 +853,20 @@ def map_x_to_distribution(self, x: torch.Tensor) -> distributions.Beta:
         return self.distribution_class(concentration0=(1 - mean) * shape, concentration1=mean * shape)

     def rescale_parameters(
-        self, parameters: torch.Tensor, target_scale: torch.Tensor, transformer: BaseEstimator
+        self, parameters: torch.Tensor, target_scale: torch.Tensor, encoder: BaseEstimator
     ) -> torch.Tensor:
-        assert transformer.coerce_positive, "Beta distribution is only compatible with strictly positive data"
-        assert (
-            not transformer.log_scale
-        ), "Beta distribution is not compatible with log transformation - use LogNormal"
-        assert not transformer.center, "Beta distribution is not compatible with centered data"
-
-        scaled_mean = torch.sigmoid(parameters[..., 0] + target_scale[..., 0].unsqueeze(1))
-        return torch.stack([
-            scaled_mean,
-            F.softplus(parameters[..., 1]) * scaled_mean * (1 - scaled_mean)
-            / torch.pow(target_scale[..., 1].unsqueeze(1), 2)
-        ], dim=-1)
+        assert encoder.transformation in ["logit"], "Beta distribution is only compatible with logit transformation"
+        assert encoder.center, "Beta distribution requires normalizer to center data"
+
+        scaled_mean = encoder(dict(prediction=parameters[..., 0], target_scale=target_scale))
+        # need to first transform target scale standard deviation in logit space to real space
+        # we assume a normal distribution in logit space (we used a logit transform and a standard scaler)
+        # and know that the variance of the beta distribution is limited by `scaled_mean * (1 - scaled_mean)`
+        mean_derivative = scaled_mean * (1 - scaled_mean)
+
+        # we can approximate variance as
+        # torch.pow(torch.tanh(target_scale[..., 1].unsqueeze(1) * torch.sqrt(mean_derivative)), 2) * mean_derivative
+        # shape is (positive) parameter * mean_derivative / var
+        shape_scaler = torch.pow(torch.tanh(target_scale[..., 1].unsqueeze(1) * torch.sqrt(mean_derivative)), 2)
+        scaled_shape = F.softplus(parameters[..., 1]) / shape_scaler
+        return torch.stack([scaled_mean, scaled_shape], dim=-1)
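For reference, also not part of the diff: a small plain-torch check of the reasoning in the comments above. With concentration1 = mean * shape and concentration0 = (1 - mean) * shape (as in map_x_to_distribution), the Beta distribution has mean equal to `mean` and variance mean * (1 - mean) / (shape + 1), so its variance is indeed bounded by scaled_mean * (1 - scaled_mean); the tanh factor, always below 1, keeps the approximated variance inside that feasible range.

import torch
from torch import distributions

# Verify the mean/shape parameterization used by map_x_to_distribution.
mean, shape = torch.tensor(0.3), torch.tensor(5.0)
beta = distributions.Beta(concentration1=mean * shape, concentration0=(1 - mean) * shape)
assert torch.isclose(beta.mean, mean)
# variance = mean * (1 - mean) / (shape + 1), hence strictly less than mean * (1 - mean)
assert torch.isclose(beta.variance, mean * (1 - mean) / (shape + 1))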