From 0f0248c9c8c818a29272e7a658a3f92827db110c Mon Sep 17 00:00:00 2001 From: bhavikar04 <77113227+bhavikar04@users.noreply.github.com> Date: Thu, 18 Apr 2024 13:38:24 +0530 Subject: [PATCH] [ENH] Lognormal probability distribution (#218) Towards #22 #### What does this implement/fix? Explain your changes. Lognormal probability distribution --- skpro/distributions/__init__.py | 2 + skpro/distributions/lognormal.py | 174 +++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 skpro/distributions/lognormal.py diff --git a/skpro/distributions/__init__.py b/skpro/distributions/__init__.py index b6af5e6ea..96393fafa 100644 --- a/skpro/distributions/__init__.py +++ b/skpro/distributions/__init__.py @@ -7,6 +7,7 @@ "Laplace", "Mixture", "Normal", + "LogNormal", "Poisson", "QPD_S", "QPD_B", @@ -17,6 +18,7 @@ from skpro.distributions.empirical import Empirical from skpro.distributions.laplace import Laplace +from skpro.distributions.lognormal import LogNormal from skpro.distributions.mixture import Mixture from skpro.distributions.normal import Normal from skpro.distributions.poisson import Poisson diff --git a/skpro/distributions/lognormal.py b/skpro/distributions/lognormal.py new file mode 100644 index 000000000..a89caf467 --- /dev/null +++ b/skpro/distributions/lognormal.py @@ -0,0 +1,174 @@ +# copyright: sktime developers, BSD-3-Clause License (see LICENSE file) +"""Log-Normal probability distribution.""" + +__author__ = ["bhavikar04", "fkiraly"] + +import numpy as np +import pandas as pd +from scipy.special import erf, erfinv + +from skpro.distributions.base import BaseDistribution + + +class LogNormal(BaseDistribution): + """Log-Normal distribution (skpro native). + + Parameters + ---------- + mean : float or array of float (1D or 2D) + mean of the logarithm of the distribution + sd : float or array of float (1D or 2D), must be positive + standard deviation the logarithm of the distribution + index : pd.Index, optional, default = RangeIndex + columns : pd.Index, optional, default = RangeIndex + + Example + ------- + >>> from skpro.distributions.lognormal import LogNormal + + >>> n = LogNormal(mu=[[0, 1], [2, 3], [4, 5]], sigma=1) + """ + + _tags = { + "authors": ["bhavikar04", "fkiraly"], + "capabilities:approx": ["pdflognorm"], + "capabilities:exact": ["mean", "var", "energy", "pdf", "log_pdf", "cdf", "ppf"], + "distr:measuretype": "continuous", + } + + def __init__(self, mu, sigma, index=None, columns=None): + self.mu = mu + self.sigma = sigma + self.index = index + self.columns = columns + + # todo: untangle index handling + # and broadcast of parameters. + # move this functionality to the base class + self._mu, self._sigma = self._get_bc_params() + shape = self._mu.shape + + if index is None: + index = pd.RangeIndex(shape[0]) + + if columns is None: + columns = pd.RangeIndex(shape[1]) + + super().__init__(index=index, columns=columns) + + def _get_bc_params(self): + """Fully broadcast parameters of self, given param shapes and index, columns.""" + to_broadcast = [self.mu, self.sigma] + if hasattr(self, "index") and self.index is not None: + to_broadcast += [self.index.to_numpy().reshape(-1, 1)] + if hasattr(self, "columns") and self.columns is not None: + to_broadcast += [self.columns.to_numpy()] + bc = np.broadcast_arrays(*to_broadcast) + return bc[0], bc[1] + + # commented out, seems incorrect + # def energy(self, x=None): + # r"""Energy of self, w.r.t. self or a constant frame x. + + # Let :math:`X, Y` be i.i.d. random variables with the distribution of `self`. + + # If `x` is `None`, returns :math:`\mathbb{E}[|X-Y|]` (per row), "self-energy". + # If `x` is passed, returns :math:`\mathbb{E}[|X-x|]-0.5\mathbb{E}[|X-Y|]` + # (per row), "CRPS wrt x". + + # Parameters + # ---------- + # x : None or pd.DataFrame, optional, default=None + # if pd.DataFrame, must have same rows and columns as `self` + + # Returns + # ------- + # pd.DataFrame with same rows as `self`, single column `"energy"` + # each row contains one float, self-energy/energy as described above. + # """ + # if x is None: + # return super().energy(x) + # # explicit formula for CRPS of log-normal cross-term + # # obtained by bhavikar04 via wolfram alpha + # else: + # d = self.loc[x.index, x.columns] + # mu_arr, sd_arr = d._mu, d._sigma + # c_arr = x * (2 * self.cdf(x) - 1) + # c_arr2 = -2 * np.exp((mu_arr + sd_arr**2) / 2) + # c_arr3 = self.cdf((np.log(x) - mu_arr - sd_arr**2) / sd_arr) + # c_arr3 = c_arr3 + self.cdf(sd_arr / mu_arr**0.5) - 1 + # c_arr2 = c_arr2 * c_arr3 + # c_arr = c_arr + c_arr2 + + # energy_arr = np.sum(c_arr, axis=1) + # energy = pd.DataFrame(energy_arr, index=self.index, columns=["energy"]) + # return energy + + def mean(self): + r"""Return expected value of the distribution. + + Let :math:`X` be a random variable with the distribution of `self`. + Returns the expectation :math:`\mathbb{E}[X]` + + Returns + ------- + pd.DataFrame with same rows, columns as `self` + expected value of distribution (entry-wise) + """ + mean_arr = np.exp(self._mu + self._sigma**2 / 2) + return pd.DataFrame(mean_arr, index=self.index, columns=self.columns) + + def var(self): + r"""Return element/entry-wise variance of the distribution. + + Let :math:`X` be a random variable with the distribution of `self`. + Returns :math:`\mathbb{V}[X] = \mathbb{E}\left(X - \mathbb{E}[X]\right)^2` + + Returns + ------- + pd.DataFrame with same rows, columns as `self` + variance of distribution (entry-wise) + """ + mu = self._mu + sigma = self._sigma + sd_arr = np.exp(2 * mu + 2 * sigma**2) - np.exp(2 * mu + sigma**2) + return pd.DataFrame(sd_arr, index=self.index, columns=self.columns) ** 2 + + def pdf(self, x): + """Probability density function.""" + d = self.loc[x.index, x.columns] + pdf_arr = np.exp(-0.5 * ((np.log(x.values) - d.mu) / d.sigma) ** 2) + pdf_arr = pdf_arr / (x.values * d.sigma * np.sqrt(2 * np.pi)) + return pd.DataFrame(pdf_arr, index=x.index, columns=x.columns) + + def log_pdf(self, x): + """Logarithmic probability density function.""" + d = self.loc[x.index, x.columns] + lpdf_arr = -0.5 * ((np.log(x.values) - d.mu) / d.sigma) ** 2 + lpdf_arr = lpdf_arr - np.log(x.values * d.sigma * np.sqrt(2 * np.pi)) + return pd.DataFrame(lpdf_arr, index=x.index, columns=x.columns) + + def cdf(self, x): + """Cumulative distribution function.""" + d = self.loc[x.index, x.columns] + cdf_arr = 0.5 + 0.5 * erf((np.log(x.values) - d.mu) / (d.sigma * np.sqrt(2))) + return pd.DataFrame(cdf_arr, index=x.index, columns=x.columns) + + def ppf(self, p): + """Quantile function = percent point function = inverse cdf.""" + d = self.loc[p.index, p.columns] + icdf_arr = d.mu + d.sigma * np.sqrt(2) * erfinv(2 * p.values - 1) + icdf_arr = np.exp(self._sigma * icdf_arr) + return pd.DataFrame(icdf_arr, index=p.index, columns=p.columns) + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator.""" + params1 = {"mu": [[0, 1], [2, 3], [4, 5]], "sigma": 1} + params2 = { + "mu": 0, + "sigma": 1, + "index": pd.Index([1, 2, 5]), + "columns": pd.Index(["a", "b"]), + } + return [params1, params2]