remove arviz dependency (#536)

* remove arviz dependency * add simple tests * fix test_ppe
arviz-devs · Sep 23, 2024 · 81a2247 · 81a2247
1 parent f25da81
commit 81a2247
Show file tree

Hide file tree

Showing 9 changed files with 317 additions and 28 deletions.
diff --git a/environment-dev.yml b/environment-dev.yml
@@ -5,7 +5,6 @@ channels:
 dependencies:
   - python >= 3.10.0
   - pip
-  - arviz
   - matplotlib>=3.5
   - numba>=0.59
   - numpy>=1.22

diff --git a/environment-docs.yml b/environment-docs.yml
@@ -4,7 +4,6 @@ channels:
   - conda-forge
 dependencies:
   - python >= 3.10
-  - arviz
   - matplotlib>=3.5
   - numba>=0.59
   - numpy>=1.22

diff --git a/environment.yml b/environment.yml
@@ -4,7 +4,6 @@ channels:
   - conda-forge
 dependencies:
   - python >= 3.10
-  - arviz
   - matplotlib>=3.5
   - numba>=0.59
   - numpy>=1.22

diff --git a/preliz/internal/narviz.py b/preliz/internal/narviz.py
@@ -0,0 +1,188 @@
+"""Functions originally imported from ArviZ"""
+import warnings
+
+import numpy as np
+from numba import njit
+
+from scipy.fftpack import fft
+from scipy.signal import convolve
+from scipy.signal.windows import gaussian
+
+from .optimization import _root
+
+
+def hdi(ary, hdi_prob=0.94, skipna=True):
+    """
+    Calculate highest density interval (HDI) of array for given probability.
+
+    The HDI is the minimum width Bayesian credible interval (BCI).
+
+    Parameters
+    ----------
+    ary: array_like
+        An array containing the values for which the HDI is to be computed.
+    hdi_prob: float, optional
+        Prob for which the highest density interval will be computed. Defaults to 0.94
+
+    Returns
+    -------
+    np.ndarray with lower and upper values of the interval.
+    """
+    if not 1 >= hdi_prob > 0:
+        raise ValueError("The value of hdi_prob should be in the interval (0, 1]")
+
+    ary = np.ravel(ary)
+    if skipna:
+        nans = np.isnan(ary)
+        if not nans.all():
+            ary = ary[~nans]
+    n = len(ary)
+
+    ary = np.sort(ary)
+    interval_idx_inc = int(np.floor(hdi_prob * n))
+    n_intervals = n - interval_idx_inc
+    interval_width = np.subtract(ary[interval_idx_inc:], ary[:n_intervals], dtype=np.float64)
+
+    if len(interval_width) == 0:
+        raise ValueError("Too few elements for interval calculation. ")
+
+    min_idx = np.argmin(interval_width)
+    hdi_min = ary[min_idx]
+    hdi_max = ary[min_idx + interval_idx_inc]
+    hdi_interval = np.array([hdi_min, hdi_max])
+
+    return hdi_interval
+
+
+def kde(x):
+    x = x[np.isfinite(x)]
+    if x.size == 0 or np.all(x == x[0]):
+        warnings.warn("Your data appears to have a single value or no finite values")
+
+        return np.zeros(2), np.array([np.nan] * 2)
+
+    grid_len = 256
+    # Preliminary calculations
+    x_min = x.min()
+    x_max = x.max()
+    x_range = x_max - x_min
+
+    # Determine grid
+    grid_min = x_min
+    grid_max = x_max
+
+    grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max))
+
+    # Bandwidth estimation
+
+    band_w = _bw_isj(x, grid_counts=grid_counts, x_range=x_range)
+
+    # Density estimation
+    grid, pdf = _kde_convolution(x, band_w, grid_edges, grid_counts, grid_len)
+
+    return grid, pdf
+
+
+@njit(cache=True)
+def histogram(data, bins, range_hist=None):
+    """Jitted histogram.
+
+    Parameters
+    ----------
+    data : array-like
+        Input data. Passed as first positional argument to ``np.histogram``.
+    bins : int or array-like
+        Passed as keyword argument ``bins`` to ``np.histogram``.
+    range_hist : (float, float), optional
+        Passed as keyword argument ``range`` to ``np.histogram``.
+
+    Returns
+    -------
+    hist : array
+        The number of counts per bin.
+    density : array
+        The density corresponding to each bin.
+    bin_edges : array
+        The edges of the bins used.
+    """
+    hist, bin_edges = np.histogram(data, bins=bins, range=range_hist)
+    hist_dens = hist / (hist.sum() * np.diff(bin_edges))
+    return hist, hist_dens, bin_edges
+
+
+def _kde_convolution(x, band_w, grid_edges, grid_counts, grid_len):
+    """Kernel density with convolution.
+
+    One dimensional Gaussian kernel density estimation via convolution of the binned relative
+    frequencies and a Gaussian filter. This is an internal function used by `kde()`.
+    """
+    # Calculate relative frequencies per bin
+    bin_width = grid_edges[1] - grid_edges[0]
+    freq = grid_counts / bin_width / len(x)
+
+    # Bandwidth must consider the bin width
+    band_w /= bin_width
+
+    grid = (grid_edges[1:] + grid_edges[:-1]) / 2
+
+    kernel_n = int(band_w * 2 * np.pi)
+    if kernel_n == 0:
+        kernel_n = 1
+
+    kernel = gaussian(kernel_n, band_w)
+
+    npad = int(grid_len / 5)
+    freq = np.concatenate([freq[npad - 1 :: -1], freq, freq[grid_len : grid_len - npad - 1 : -1]])
+    pdf = convolve(freq, kernel, mode="same", method="direct")[npad : npad + grid_len]
+    pdf /= band_w * (2 * np.pi) ** 0.5
+
+    return grid, pdf
+
+
+def _bw_isj(x, grid_counts=None, x_range=None):
+    """Improved Sheather-Jones bandwidth estimation.
+
+    Improved Sheather and Jones method as explained in [1]_. This method is used internally by the
+    KDE estimator, resulting in saved computation time as minimums, maximums and the grid are
+    pre-computed.
+
+    References
+    ----------
+    .. [1] Kernel density estimation via diffusion.
+       Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
+       Ann. Statist. 38 (2010), no. 5, 2916--2957.
+    """
+    x_len = len(x)
+    grid_len = len(grid_counts) - 1
+    # Discrete cosine transform of the data
+    a_k = _dct1d(grid_counts / x_len)
+
+    k_sq = np.arange(1, grid_len) ** 2
+    a_sq = a_k[range(1, grid_len)] ** 2
+    return _root(x_len, k_sq, a_sq, x) ** 0.5 * x_range
+
+
+def _dct1d(x):
+    """Discrete Cosine Transform in 1 Dimension.
+
+    Parameters
+    ----------
+    x : numpy array
+        1 dimensional array of values for which the
+        DCT is desired
+
+    Returns
+    -------
+    output : DTC transformed values
+    """
+    x_len = len(x)
+
+    even_increasing = np.arange(0, x_len, 2)
+    odd_decreasing = np.arange(x_len - 1, 0, -2)
+
+    x = np.concatenate((x[even_increasing], x[odd_decreasing]))
+
+    w_1k = np.r_[1, (2 * np.exp(-(0 + 1j) * (np.arange(1, x_len)) * np.pi / (2 * x_len)))]
+    output = np.real(w_1k * fft(x))
+
+    return output
diff --git a/preliz/internal/optimization.py b/preliz/internal/optimization.py
@@ -491,3 +491,64 @@ def get_weighted_rvs(target, size, rng):
     target_rnd_choices = np.random.choice(len(targets), size=size, p=weights)
     samples = [target.rvs(size, random_state=rng) for target in targets]
     return np.choose(target_rnd_choices, samples)
+
+
+def _root(n_p, k_sq, a_sq, x):
+    def _fixed_point(t_s, x_len, k_sq, a_sq):
+        """Calculate t-zeta*gamma^[l](t).
+
+        Implementation of the function t-zeta*gamma^[l](t) derived from equation (30) in [1].
+
+        References
+        ----------
+        .. [1] Kernel density estimation via diffusion.
+        Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
+        Ann. Statist. 38 (2010), no. 5, 2916--2957.
+        """
+        k_sq = np.asarray(k_sq, dtype=np.float64)
+        a_sq = np.asarray(a_sq, dtype=np.float64)
+
+        k_o = 7
+        func = np.sum(np.power(k_sq, k_o) * a_sq * np.exp(-k_sq * np.pi**2 * t_s))
+        func *= 0.5 * np.pi ** (2.0 * k_o)
+
+        for ite in np.arange(k_o - 1, 2 - 1, -1):
+            c_1 = (1 + 0.5 ** (ite + 0.5)) / 3
+            c_2 = np.prod(np.arange(1.0, 2 * ite + 1, 2, dtype=np.float64))
+            c_2 /= (np.pi / 2) ** 0.5
+            t_j = np.power((c_1 * (c_2 / (x_len * func))), (2.0 / (3.0 + 2.0 * ite)))
+            func = np.sum(k_sq**ite * a_sq * np.exp(-k_sq * np.pi**2.0 * t_j))
+            func *= 0.5 * np.pi ** (2 * ite)
+
+        out = t_s - (2 * x_len * np.pi**0.5 * func) ** (-0.4)
+        return out
+
+    # The right bound is at most 0.01
+    found = False
+    n_p = max(min(1050, n_p), 50)
+    tol = 10e-12 + 0.01 * (n_p - 50) / 1000
+
+    while not found:
+        try:
+            band_w, res = brentq(
+                _fixed_point, 0, 0.01, args=(n_p, k_sq, a_sq), full_output=True, disp=False
+            )
+            found = res.converged
+        except ValueError:
+            band_w = 0
+            tol *= 2.0
+            found = False
+        if band_w <= 0 or tol >= 1:
+            band_w = (_bw_silverman(x) / np.ptp(x)) ** 2
+            return band_w
+    return band_w
+
+
+def _bw_silverman(x, x_std=None):  # pylint: disable=unused-argument
+    """Silverman's Rule."""
+    x_std = np.std(x)
+    q75, q25 = np.percentile(x, [75, 25])
+    x_iqr = q75 - q25
+    a = min(x_std, x_iqr / 1.34)
+    band_w = 0.9 * a * len(x) ** (-0.2)
+    return band_w
diff --git a/preliz/internal/plot_helper.py b/preliz/internal/plot_helper.py
@@ -9,13 +9,12 @@
 except ImportError:
     pass
 
-from arviz import plot_kde, plot_ecdf, hdi, extract
-from arviz.stats.density_utils import _kde_linear
 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib import _pylab_helpers, get_backend
 from matplotlib.ticker import MaxNLocator
 from .logging import disable_pymc_sampling_logs
+from .narviz import hdi, kde
 
 
 def plot_pointinterval(distribution, interval="hdi", levels=None, rotated=False, ax=None):
@@ -442,7 +441,9 @@ def looper(*args, **kwargs):
         model.build()
         with disable_pymc_sampling_logs():
             idata = model.prior_predictive(iterations)
-        results = extract(idata, group="prior_predictive")[model.response_name].values.T
+        results = (
+            idata["prior_predictive"].stack(sample=("chain", "draw"))[model.response_name].values.T
+        )
 
         _, ax = plt.subplots()
         ax.set_xlim(x_min, x_max, auto=auto)
@@ -469,7 +470,8 @@ def looper(*args, **kwargs):
             obs_name = model.observed_RVs[0].name
             with disable_pymc_sampling_logs():
                 idata = sample_prior_predictive(samples=iterations)
-            results = extract(idata, group="prior_predictive")[obs_name].values.T
+            results = idata["prior_predictive"].stack(sample=("chain", "draw"))[obs_name].values.T
+
         _, ax = plt.subplots()
         ax.set_xlim(x_min, x_max, auto=auto)
         if plot_func is None:
@@ -508,8 +510,8 @@ def plot_repr(results, kind_plot, references, iterations, ax):
         )
     elif kind_plot == "kde":
         for result in results:
-            ax.plot(*_kde_linear(result, grid_len=100), "0.5", alpha=alpha)
-        ax.plot(*_kde_linear(np.concatenate(results), grid_len=100), "k--")
+            ax.plot(*kde(result), "0.5", alpha=alpha)
+        ax.plot(*kde(np.concatenate(results)), "k--")
     elif kind_plot == "ecdf":
         ax.plot(
             np.sort(results, axis=1).T,
@@ -555,15 +557,13 @@ def plot_pp_samples(pp_samples, pp_samples_idxs, references, kind="pdf", sharex=
                 x_lims[1] = max_
 
         if kind == "pdf":
-            plot_kde(sample, ax=ax, plot_kwargs={"color": "C0"})  # pylint:disable=no-member
+            plot_kde(sample, ax=ax, color="C0")
         elif kind == "hist":
-            bins, *_ = ax.hist(
-                sample, color="C0", bins="auto", alpha=0.5, density=True
-            )  # pylint:disable=no-member
+            bins, *_ = ax.hist(sample, color="C0", bins="auto", alpha=0.5, density=True)
             ax.set_ylim(-bins.max() * 0.05, None)
 
         elif kind == "ecdf":
-            plot_ecdf(sample, ax=ax, plot_kwargs={"color": "C0"})  # pylint:disable=no-member
+            ax.hist(sample, color="C0")
 
         plot_pointinterval(sample, ax=ax)
         ax.set_title(idx, alpha=0)
@@ -596,18 +596,17 @@ def plot_pp_mean(pp_samples, selected, references=None, kind="pdf", fig_pp_mean=
 
     if kind == "pdf":
         plot_kde(
-            sample, ax=ax_pp_mean, plot_kwargs={"color": "k", "linestyle": "--"}
-        )  # pylint:disable=no-member
+            sample,
+            ax=ax_pp_mean,
+            color="C0",
+            linestyle="--",
+        )
     elif kind == "hist":
-        bins, *_ = ax_pp_mean.hist(
-            sample, color="k", ls="--", bins="auto", alpha=0.5, density=True
-        )  # pylint:disable=no-member
+        bins, *_ = ax_pp_mean.hist(sample, color="k", ls="--", bins="auto", alpha=0.5, density=True)
         ax_pp_mean.set_ylim(-bins.max() * 0.05, None)
 
     elif kind == "ecdf":
-        plot_ecdf(
-            sample, ax=ax_pp_mean, plot_kwargs={"color": "k", "linestyle": "--"}
-        )  # pylint:disable=no-member
+        ax_pp_mean.ecdf(sample, color="k", linestyle="--")
 
     plot_pointinterval(sample, ax=ax_pp_mean)
     ax_pp_mean.set_yticks([])
@@ -630,6 +629,11 @@ def plot_references(references, ax):
                 ax.axvline(ref, ls="--", color="0.5")
 
 
+def plot_kde(sample, ax, **kwargs):
+    grid, pdf = kde(sample)
+    ax.plot(grid, pdf, **kwargs)
+
+
 def check_inside_notebook(need_widget=False):
     shell = get_ipython()
     name = inspect.currentframe().f_back.f_code.co_name