Skip to content

Commit

Permalink
remove arviz dependency (#536)
Browse files Browse the repository at this point in the history
* remove arviz dependency

* add simple tests

* fix test_ppe
  • Loading branch information
aloctavodia authored Sep 23, 2024
1 parent f25da81 commit 81a2247
Show file tree
Hide file tree
Showing 9 changed files with 317 additions and 28 deletions.
1 change: 0 additions & 1 deletion environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ channels:
dependencies:
- python >= 3.10.0
- pip
- arviz
- matplotlib>=3.5
- numba>=0.59
- numpy>=1.22
Expand Down
1 change: 0 additions & 1 deletion environment-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ channels:
- conda-forge
dependencies:
- python >= 3.10
- arviz
- matplotlib>=3.5
- numba>=0.59
- numpy>=1.22
Expand Down
1 change: 0 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ channels:
- conda-forge
dependencies:
- python >= 3.10
- arviz
- matplotlib>=3.5
- numba>=0.59
- numpy>=1.22
Expand Down
188 changes: 188 additions & 0 deletions preliz/internal/narviz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
"""Functions originally imported from ArviZ"""
import warnings

import numpy as np
from numba import njit

from scipy.fftpack import fft
from scipy.signal import convolve
from scipy.signal.windows import gaussian

from .optimization import _root


def hdi(ary, hdi_prob=0.94, skipna=True):
"""
Calculate highest density interval (HDI) of array for given probability.
The HDI is the minimum width Bayesian credible interval (BCI).
Parameters
----------
ary: array_like
An array containing the values for which the HDI is to be computed.
hdi_prob: float, optional
Prob for which the highest density interval will be computed. Defaults to 0.94
Returns
-------
np.ndarray with lower and upper values of the interval.
"""
if not 1 >= hdi_prob > 0:
raise ValueError("The value of hdi_prob should be in the interval (0, 1]")

ary = np.ravel(ary)
if skipna:
nans = np.isnan(ary)
if not nans.all():
ary = ary[~nans]
n = len(ary)

ary = np.sort(ary)
interval_idx_inc = int(np.floor(hdi_prob * n))
n_intervals = n - interval_idx_inc
interval_width = np.subtract(ary[interval_idx_inc:], ary[:n_intervals], dtype=np.float64)

if len(interval_width) == 0:
raise ValueError("Too few elements for interval calculation. ")

min_idx = np.argmin(interval_width)
hdi_min = ary[min_idx]
hdi_max = ary[min_idx + interval_idx_inc]
hdi_interval = np.array([hdi_min, hdi_max])

return hdi_interval


def kde(x):
x = x[np.isfinite(x)]
if x.size == 0 or np.all(x == x[0]):
warnings.warn("Your data appears to have a single value or no finite values")

return np.zeros(2), np.array([np.nan] * 2)

grid_len = 256
# Preliminary calculations
x_min = x.min()
x_max = x.max()
x_range = x_max - x_min

# Determine grid
grid_min = x_min
grid_max = x_max

grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max))

# Bandwidth estimation

band_w = _bw_isj(x, grid_counts=grid_counts, x_range=x_range)

# Density estimation
grid, pdf = _kde_convolution(x, band_w, grid_edges, grid_counts, grid_len)

return grid, pdf


@njit(cache=True)
def histogram(data, bins, range_hist=None):
"""Jitted histogram.
Parameters
----------
data : array-like
Input data. Passed as first positional argument to ``np.histogram``.
bins : int or array-like
Passed as keyword argument ``bins`` to ``np.histogram``.
range_hist : (float, float), optional
Passed as keyword argument ``range`` to ``np.histogram``.
Returns
-------
hist : array
The number of counts per bin.
density : array
The density corresponding to each bin.
bin_edges : array
The edges of the bins used.
"""
hist, bin_edges = np.histogram(data, bins=bins, range=range_hist)
hist_dens = hist / (hist.sum() * np.diff(bin_edges))
return hist, hist_dens, bin_edges


def _kde_convolution(x, band_w, grid_edges, grid_counts, grid_len):
"""Kernel density with convolution.
One dimensional Gaussian kernel density estimation via convolution of the binned relative
frequencies and a Gaussian filter. This is an internal function used by `kde()`.
"""
# Calculate relative frequencies per bin
bin_width = grid_edges[1] - grid_edges[0]
freq = grid_counts / bin_width / len(x)

# Bandwidth must consider the bin width
band_w /= bin_width

grid = (grid_edges[1:] + grid_edges[:-1]) / 2

kernel_n = int(band_w * 2 * np.pi)
if kernel_n == 0:
kernel_n = 1

kernel = gaussian(kernel_n, band_w)

npad = int(grid_len / 5)
freq = np.concatenate([freq[npad - 1 :: -1], freq, freq[grid_len : grid_len - npad - 1 : -1]])
pdf = convolve(freq, kernel, mode="same", method="direct")[npad : npad + grid_len]
pdf /= band_w * (2 * np.pi) ** 0.5

return grid, pdf


def _bw_isj(x, grid_counts=None, x_range=None):
"""Improved Sheather-Jones bandwidth estimation.
Improved Sheather and Jones method as explained in [1]_. This method is used internally by the
KDE estimator, resulting in saved computation time as minimums, maximums and the grid are
pre-computed.
References
----------
.. [1] Kernel density estimation via diffusion.
Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
Ann. Statist. 38 (2010), no. 5, 2916--2957.
"""
x_len = len(x)
grid_len = len(grid_counts) - 1
# Discrete cosine transform of the data
a_k = _dct1d(grid_counts / x_len)

k_sq = np.arange(1, grid_len) ** 2
a_sq = a_k[range(1, grid_len)] ** 2
return _root(x_len, k_sq, a_sq, x) ** 0.5 * x_range


def _dct1d(x):
"""Discrete Cosine Transform in 1 Dimension.
Parameters
----------
x : numpy array
1 dimensional array of values for which the
DCT is desired
Returns
-------
output : DTC transformed values
"""
x_len = len(x)

even_increasing = np.arange(0, x_len, 2)
odd_decreasing = np.arange(x_len - 1, 0, -2)

x = np.concatenate((x[even_increasing], x[odd_decreasing]))

w_1k = np.r_[1, (2 * np.exp(-(0 + 1j) * (np.arange(1, x_len)) * np.pi / (2 * x_len)))]
output = np.real(w_1k * fft(x))

return output
61 changes: 61 additions & 0 deletions preliz/internal/optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,3 +491,64 @@ def get_weighted_rvs(target, size, rng):
target_rnd_choices = np.random.choice(len(targets), size=size, p=weights)
samples = [target.rvs(size, random_state=rng) for target in targets]
return np.choose(target_rnd_choices, samples)


def _root(n_p, k_sq, a_sq, x):
def _fixed_point(t_s, x_len, k_sq, a_sq):
"""Calculate t-zeta*gamma^[l](t).
Implementation of the function t-zeta*gamma^[l](t) derived from equation (30) in [1].
References
----------
.. [1] Kernel density estimation via diffusion.
Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
Ann. Statist. 38 (2010), no. 5, 2916--2957.
"""
k_sq = np.asarray(k_sq, dtype=np.float64)
a_sq = np.asarray(a_sq, dtype=np.float64)

k_o = 7
func = np.sum(np.power(k_sq, k_o) * a_sq * np.exp(-k_sq * np.pi**2 * t_s))
func *= 0.5 * np.pi ** (2.0 * k_o)

for ite in np.arange(k_o - 1, 2 - 1, -1):
c_1 = (1 + 0.5 ** (ite + 0.5)) / 3
c_2 = np.prod(np.arange(1.0, 2 * ite + 1, 2, dtype=np.float64))
c_2 /= (np.pi / 2) ** 0.5
t_j = np.power((c_1 * (c_2 / (x_len * func))), (2.0 / (3.0 + 2.0 * ite)))
func = np.sum(k_sq**ite * a_sq * np.exp(-k_sq * np.pi**2.0 * t_j))
func *= 0.5 * np.pi ** (2 * ite)

out = t_s - (2 * x_len * np.pi**0.5 * func) ** (-0.4)
return out

# The right bound is at most 0.01
found = False
n_p = max(min(1050, n_p), 50)
tol = 10e-12 + 0.01 * (n_p - 50) / 1000

while not found:
try:
band_w, res = brentq(
_fixed_point, 0, 0.01, args=(n_p, k_sq, a_sq), full_output=True, disp=False
)
found = res.converged
except ValueError:
band_w = 0
tol *= 2.0
found = False
if band_w <= 0 or tol >= 1:
band_w = (_bw_silverman(x) / np.ptp(x)) ** 2
return band_w
return band_w


def _bw_silverman(x, x_std=None): # pylint: disable=unused-argument
"""Silverman's Rule."""
x_std = np.std(x)
q75, q25 = np.percentile(x, [75, 25])
x_iqr = q75 - q25
a = min(x_std, x_iqr / 1.34)
band_w = 0.9 * a * len(x) ** (-0.2)
return band_w
42 changes: 23 additions & 19 deletions preliz/internal/plot_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@
except ImportError:
pass

from arviz import plot_kde, plot_ecdf, hdi, extract
from arviz.stats.density_utils import _kde_linear
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import _pylab_helpers, get_backend
from matplotlib.ticker import MaxNLocator
from .logging import disable_pymc_sampling_logs
from .narviz import hdi, kde


def plot_pointinterval(distribution, interval="hdi", levels=None, rotated=False, ax=None):
Expand Down Expand Up @@ -442,7 +441,9 @@ def looper(*args, **kwargs):
model.build()
with disable_pymc_sampling_logs():
idata = model.prior_predictive(iterations)
results = extract(idata, group="prior_predictive")[model.response_name].values.T
results = (
idata["prior_predictive"].stack(sample=("chain", "draw"))[model.response_name].values.T
)

_, ax = plt.subplots()
ax.set_xlim(x_min, x_max, auto=auto)
Expand All @@ -469,7 +470,8 @@ def looper(*args, **kwargs):
obs_name = model.observed_RVs[0].name
with disable_pymc_sampling_logs():
idata = sample_prior_predictive(samples=iterations)
results = extract(idata, group="prior_predictive")[obs_name].values.T
results = idata["prior_predictive"].stack(sample=("chain", "draw"))[obs_name].values.T

_, ax = plt.subplots()
ax.set_xlim(x_min, x_max, auto=auto)
if plot_func is None:
Expand Down Expand Up @@ -508,8 +510,8 @@ def plot_repr(results, kind_plot, references, iterations, ax):
)
elif kind_plot == "kde":
for result in results:
ax.plot(*_kde_linear(result, grid_len=100), "0.5", alpha=alpha)
ax.plot(*_kde_linear(np.concatenate(results), grid_len=100), "k--")
ax.plot(*kde(result), "0.5", alpha=alpha)
ax.plot(*kde(np.concatenate(results)), "k--")
elif kind_plot == "ecdf":
ax.plot(
np.sort(results, axis=1).T,
Expand Down Expand Up @@ -555,15 +557,13 @@ def plot_pp_samples(pp_samples, pp_samples_idxs, references, kind="pdf", sharex=
x_lims[1] = max_

if kind == "pdf":
plot_kde(sample, ax=ax, plot_kwargs={"color": "C0"}) # pylint:disable=no-member
plot_kde(sample, ax=ax, color="C0")
elif kind == "hist":
bins, *_ = ax.hist(
sample, color="C0", bins="auto", alpha=0.5, density=True
) # pylint:disable=no-member
bins, *_ = ax.hist(sample, color="C0", bins="auto", alpha=0.5, density=True)
ax.set_ylim(-bins.max() * 0.05, None)

elif kind == "ecdf":
plot_ecdf(sample, ax=ax, plot_kwargs={"color": "C0"}) # pylint:disable=no-member
ax.hist(sample, color="C0")

plot_pointinterval(sample, ax=ax)
ax.set_title(idx, alpha=0)
Expand Down Expand Up @@ -596,18 +596,17 @@ def plot_pp_mean(pp_samples, selected, references=None, kind="pdf", fig_pp_mean=

if kind == "pdf":
plot_kde(
sample, ax=ax_pp_mean, plot_kwargs={"color": "k", "linestyle": "--"}
) # pylint:disable=no-member
sample,
ax=ax_pp_mean,
color="C0",
linestyle="--",
)
elif kind == "hist":
bins, *_ = ax_pp_mean.hist(
sample, color="k", ls="--", bins="auto", alpha=0.5, density=True
) # pylint:disable=no-member
bins, *_ = ax_pp_mean.hist(sample, color="k", ls="--", bins="auto", alpha=0.5, density=True)
ax_pp_mean.set_ylim(-bins.max() * 0.05, None)

elif kind == "ecdf":
plot_ecdf(
sample, ax=ax_pp_mean, plot_kwargs={"color": "k", "linestyle": "--"}
) # pylint:disable=no-member
ax_pp_mean.ecdf(sample, color="k", linestyle="--")

plot_pointinterval(sample, ax=ax_pp_mean)
ax_pp_mean.set_yticks([])
Expand All @@ -630,6 +629,11 @@ def plot_references(references, ax):
ax.axvline(ref, ls="--", color="0.5")


def plot_kde(sample, ax, **kwargs):
grid, pdf = kde(sample)
ax.plot(grid, pdf, **kwargs)


def check_inside_notebook(need_widget=False):
shell = get_ipython()
name = inspect.currentframe().f_back.f_code.co_name
Expand Down
Loading

0 comments on commit 81a2247

Please sign in to comment.