Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add kernel option, simpler chi2 calculation #21

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 34 additions & 12 deletions src/pqm/pqm.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import Optional, Union, Tuple
from typing import Optional, Union, Tuple, Callable
import warnings

import torch
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import chi2_contingency, chi2

from .utils import (
_mean_std_numpy,
Expand All @@ -12,7 +12,6 @@
_sample_reference_indices_torch,
_compute_counts_numpy,
_compute_counts_torch,
_rescale_chi2,
)

__all__ = ("pqm_chi2", "pqm_pvalue")
Expand All @@ -25,6 +24,7 @@ def _pqm_test(
z_score_norm: bool,
x_frac: Optional[float],
gauss_frac: float,
kernel: Union[str, Callable] = "euclidean",
) -> Tuple:
"""
Helper function to perform the PQM test and return the results from
Expand All @@ -46,8 +46,8 @@ def _pqm_test(
Number of times _pqm_test is called, re-tesselating the space. No
re_tessellation if None (default).
z_score_norm : bool
If True, z_score_norm the samples by subtracting the mean and dividing by the
standard deviation. mean and std are calculated from the combined
If True, z_score_norm the samples by subtracting the mean and dividing
by the standard deviation. mean and std are calculated from the combined
x_samples and y_samples.
x_frac : float
Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
Expand All @@ -60,6 +60,12 @@ def _pqm_test(
determined from the combined x_samples/y_samples. This ensures full
support of the reference samples if pathological behavior is expected.
Default: 0.0 no gaussian samples.
kernel : str or callable
Kernel function to use for distance calculation. If a string, must be
one of 'euclidean', 'cityblock', 'cosine', 'chebyshev', 'canberra', or
'correlation' (see ``scipy.distance.cdist``). If a callable, must take
two vectors and return a scalar, should also be commutative. This only
works for numpy array inputs. Default: 'euclidean'.

Note
----
Expand Down Expand Up @@ -143,7 +149,7 @@ def _pqm_test(
refs, x_samples, y_samples = _sample_reference_indices_numpy(
Nx, Ny, Ng, x_samples, y_samples
)
counts_x, counts_y = _compute_counts_numpy(x_samples, y_samples, refs, num_refs)
counts_x, counts_y = _compute_counts_numpy(x_samples, y_samples, refs, num_refs, kernel)

# Remove references with no counts
C = (counts_x > 0) | (counts_y > 0)
Expand Down Expand Up @@ -183,6 +189,7 @@ def pqm_pvalue(
z_score_norm: bool = False,
x_frac: Optional[float] = None,
gauss_frac: float = 0.0,
kernel: str = "euclidean",
):
"""
Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
Expand Down Expand Up @@ -219,6 +226,12 @@ def pqm_pvalue(
determined from the combined x_samples/y_samples. This ensures full
support of the reference samples if pathological behavior is expected.
Default: 0.0 no gaussian samples.
kernel : str or callable
Kernel function to use for distance calculation. If a string, must be
one of 'euclidean', 'cityblock', 'cosine', 'chebyshev', 'canberra', or
'correlation' (see ``scipy.distance.cdist``). If a callable, must take
two vectors and return a scalar, should also be commutative. This only
works for numpy array inputs. Default: 'euclidean'.

Note
----
Expand Down Expand Up @@ -250,11 +263,14 @@ def pqm_pvalue(
z_score_norm=z_score_norm,
x_frac=x_frac,
gauss_frac=gauss_frac,
kernel=kernel,
)
for _ in range(re_tessellation)
]

_, p_value, _, _ = _pqm_test(x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac)
_, p_value, _, _ = _pqm_test(
x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac, kernel
)

# Return p-value as a float
return p_value
Expand All @@ -268,6 +284,7 @@ def pqm_chi2(
z_score_norm: bool = False,
x_frac: Optional[float] = None,
gauss_frac: float = 0.0,
kernel: str = "euclidean",
):
"""
Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
Expand Down Expand Up @@ -304,6 +321,12 @@ def pqm_chi2(
determined from the combined x_samples/y_samples. This ensures full
support of the reference samples if pathological behavior is expected.
Default: 0.0 no gaussian samples.
kernel : str or callable
Kernel function to use for distance calculation. If a string, must be
one of 'euclidean', 'cityblock', 'cosine', 'chebyshev', 'canberra', or
'correlation' (see ``scipy.distance.cdist``). If a callable, must take
two vectors and return a scalar, should also be commutative. This only
works for numpy array inputs. Default: 'euclidean'.

Note
----
Expand Down Expand Up @@ -344,17 +367,16 @@ def pqm_chi2(
z_score_norm=z_score_norm,
x_frac=x_frac,
gauss_frac=gauss_frac,
kernel=kernel,
)
for _ in range(re_tessellation)
]

chi2_stat, _, dof, _ = _pqm_test(
x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac
_, p_value, _, _ = _pqm_test(
x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac, kernel
)

# Rescale chi2 statistic if necessary
if dof != num_refs - 1:
chi2_stat = _rescale_chi2(chi2_stat, dof, num_refs - 1)
chi2_stat = chi2.isf(p_value, num_refs - 1)

# Return chi2_stat as a float
return chi2_stat
28 changes: 8 additions & 20 deletions src/pqm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
__all__ = (
"_mean_std_torch",
"_mean_std_numpy",
"_rescale_chi2",
"_chi2_contingency_torch",
"_sample_reference_indices_numpy",
"_compute_counts_numpy",
Expand Down Expand Up @@ -51,19 +50,6 @@ def _mean_std_numpy(sample1, sample2):
return m, s


def _rescale_chi2(chi2_stat, orig_dof, target_dof):
"""
Rescale chi2 statistic using appropriate methods depending on the offset.
"""

if chi2_stat / orig_dof < 10:
# Use cumulative probability method for better accuracy
cp = chi2.sf(chi2_stat, orig_dof)
return chi2.isf(cp, target_dof)
# Use simple scaling for large values
return chi2_stat * target_dof / orig_dof


def _sample_reference_indices_torch(Nx, Ny, Ng, x_samples, y_samples):
"""
Helper function to sample references for GPU-based Torch computations.
Expand Down Expand Up @@ -149,7 +135,7 @@ def _sample_reference_indices_numpy(Nx, Ny, Ng, x_samples, y_samples):
return refs, x_samples, y_samples


def _compute_counts_torch(x_samples, y_samples, refs, num_refs):
def _compute_counts_torch(x_samples, y_samples, refs, num_refs, p=2.0):
"""
Helper function to calculate distances for GPU-based Torch computations.

Expand All @@ -176,8 +162,8 @@ def _compute_counts_torch(x_samples, y_samples, refs, num_refs):
"""

# Compute distances and find nearest references
distances_x = torch.cdist(x_samples, refs)
distances_y = torch.cdist(y_samples, refs)
distances_x = torch.cdist(x_samples, refs, p=p)
distances_y = torch.cdist(y_samples, refs, p=p)

idx_x = distances_x.argmin(dim=1)
idx_y = distances_y.argmin(dim=1)
Expand All @@ -188,7 +174,7 @@ def _compute_counts_torch(x_samples, y_samples, refs, num_refs):
return counts_x.cpu().numpy(), counts_y.cpu().numpy()


def _compute_counts_numpy(x_samples, y_samples, refs, num_refs):
def _compute_counts_numpy(x_samples, y_samples, refs, num_refs, kernel="euclidean"):
"""
Helper function to calculate distances for CPU-based NumPy computations.

Expand All @@ -207,6 +193,8 @@ def _compute_counts_numpy(x_samples, y_samples, refs, num_refs):
Number of reference samples used in the test.
num_refs : int
Number of reference samples to use.
kernel : str or callable
kernel function for distance calculations.

Returns
-------
Expand All @@ -215,8 +203,8 @@ def _compute_counts_numpy(x_samples, y_samples, refs, num_refs):
"""

# Compute distances
distances_x = cdist(x_samples, refs, metric="euclidean")
distances_y = cdist(y_samples, refs, metric="euclidean")
distances_x = cdist(x_samples, refs, metric=kernel)
distances_y = cdist(y_samples, refs, metric=kernel)

# Nearest references
idx_x = np.argmin(distances_x, axis=1)
Expand Down