Skip to content

Commit

Permalink
updated silvermans_rule to work with multidimensional data
Browse files Browse the repository at this point in the history
  • Loading branch information
bean217 committed Mar 14, 2024
1 parent d7bea3f commit 13f7380
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 17 deletions.
27 changes: 11 additions & 16 deletions KDEpy/bw_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,6 @@ def silvermans_rule(data, weights=None):
Returns optimal smoothing (standard deviation) if the data is close to
normal.
TODO: Extend to multidimensional:
https://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.
stats.gaussian_kde.html#r216
Examples
--------
>>> data = np.arange(9).reshape(-1, 1)
Expand All @@ -256,43 +252,42 @@ def silvermans_rule(data, weights=None):
if not len(data.shape) == 2:
raise ValueError("Data must be of shape (obs, dims).")
obs, dims = data.shape
if not dims == 1:
raise ValueError("Silverman's rule is only available for 1D data.")

if weights is not None:
warnings.warn("Silverman's rule currently ignores all weights")

if obs == 1:
return 1
return 1.0 if dims < 2 else np.asarray([1.0] * dims)
if obs < 1:
raise ValueError("Data must be of length > 0.")

sigma = np.std(data, ddof=1)
sigma = np.std(data, axis=0)
# scipy.stats.norm.ppf(.75) - scipy.stats.norm.ppf(.25) -> 1.3489795003921634
IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634
IQR = (np.percentile(data, axis=0, q=75) - np.percentile(data, axis=0, q=25)) / 1.3489795003921634

sigma = min(sigma, IQR)
sigma = np.min(np.stack([sigma, IQR]), axis=0)

# The logic below is not related to silverman's rule, but if the data is constant
# it's nice to return a value instead of getting an error. A warning will be raised.
if sigma > 0:
return sigma * (obs * 3 / 4.0) ** (-1 / 5)
if np.min(sigma) > 0:
res = sigma * (obs * 3 / 4.0) ** (-1 / 5)
return res[0] if dims < 2 else res
else:
# stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815
IQR = (np.percentile(data, q=99) - np.percentile(data, q=1)) / 4.6526957480816815
if IQR > 0:
IQR = (np.percentile(data, axis=0, q=99) - np.percentile(data, axis=0, q=1)) / 4.6526957480816815
if np.min(IQR) > 0:
bw = IQR * (obs * 3 / 4.0) ** (-1 / 5)
warnings.warn(
"Silverman's rule failed. Too many idential values. \
Setting bw = {}".format(
bw
)
)
return bw
return bw[0] if dims < 2 else bw

# Here, all values are basically constant
warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0")
return 1.0
return 1.0 if dims < 2 else np.asarray([1.0] * dims)


_bw_methods = {
Expand Down
26 changes: 25 additions & 1 deletion KDEpy/tests/test_bw_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@
import numpy as np
import pytest

from KDEpy.bw_selection import _bw_methods, improved_sheather_jones
from KDEpy.bw_selection import _bw_methods, improved_sheather_jones, silvermans_rule


@pytest.fixture(scope="module")
def data() -> np.ndarray:
return np.random.randn(100, 1)

@pytest.fixture(scope="module")
def multidim_data() -> np.ndarray:
return np.random.randn(100, 2)


@pytest.mark.parametrize("method", _bw_methods.values())
def test_equal_weights_dont_changed_bw(data, method):
Expand All @@ -23,6 +27,13 @@ def test_equal_weights_dont_changed_bw(data, method):
np.testing.assert_almost_equal(bw_no_weights, bw_weighted)


def test_multidim_silvermans_rule_weights_dont_changed_bw(multidim_data):
weights = np.ones_like(multidim_data).squeeze() * 2
bw_no_weights = silvermans_rule(multidim_data, weights=None)
bw_weighted = silvermans_rule(multidim_data, weights=weights)
np.testing.assert_almost_equal(bw_no_weights, bw_weighted)


def test_isj_bw_weights_single_zero_weighted_point(data):
data_with_outlier = np.concatenate((data.copy(), np.array([[1000]])))
weights = np.ones_like(data_with_outlier).squeeze()
Expand All @@ -45,6 +56,19 @@ def test_isj_bw_weights_same_as_resampling(data, execution_number):
)


def test_onedim_silvermans_rule_shape(data):
sr_res = silvermans_rule(data)
# dims is a float
assert isinstance(sr_res, float)


def test_multidim_silvermans_rule_shape(multidim_data):
sr_res = silvermans_rule(multidim_data)
# dims shape is 2
dim = sr_res.shape[0]
assert dim == 2


if __name__ == "__main__":
# --durations=10 <- May be used to show potentially slow tests
pytest.main(args=[__file__, "--doctest-modules", "-v", "--durations=15"])

0 comments on commit 13f7380

Please sign in to comment.