From 536fd758705d8ba69cfe68ba0fd0e986d7c5eecb Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Wed, 20 Nov 2024 10:22:28 -0500 Subject: [PATCH] Add kernel option, simpler chi2 calculation --- src/pqm/pqm.py | 46 ++++++++++++++++++++++++++++++++++------------ src/pqm/utils.py | 28 ++++++++-------------------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py index 767dddc..bfd9740 100644 --- a/src/pqm/pqm.py +++ b/src/pqm/pqm.py @@ -1,9 +1,9 @@ -from typing import Optional, Union, Tuple +from typing import Optional, Union, Tuple, Callable import warnings import torch import numpy as np -from scipy.stats import chi2_contingency +from scipy.stats import chi2_contingency, chi2 from .utils import ( _mean_std_numpy, @@ -12,7 +12,6 @@ _sample_reference_indices_torch, _compute_counts_numpy, _compute_counts_torch, - _rescale_chi2, ) __all__ = ("pqm_chi2", "pqm_pvalue") @@ -25,6 +24,7 @@ def _pqm_test( z_score_norm: bool, x_frac: Optional[float], gauss_frac: float, + kernel: Union[str, Callable] = "euclidean", ) -> Tuple: """ Helper function to perform the PQM test and return the results from @@ -46,8 +46,8 @@ def _pqm_test( Number of times _pqm_test is called, re-tesselating the space. No re_tessellation if None (default). z_score_norm : bool - If True, z_score_norm the samples by subtracting the mean and dividing by the - standard deviation. mean and std are calculated from the combined + If True, z_score_norm the samples by subtracting the mean and dividing + by the standard deviation. mean and std are calculated from the combined x_samples and y_samples. x_frac : float Fraction of x_samples to use as reference samples. ``x_frac = 1`` will @@ -60,6 +60,12 @@ def _pqm_test( determined from the combined x_samples/y_samples. This ensures full support of the reference samples if pathological behavior is expected. Default: 0.0 no gaussian samples. + kernel : str or callable + Kernel function to use for distance calculation. If a string, must be + one of 'euclidean', 'cityblock', 'cosine', 'chebyshev', 'canberra', or + 'correlation' (see ``scipy.distance.cdist``). If a callable, must take + two vectors and return a scalar, should also be commutative. This only + works for numpy array inputs. Default: 'euclidean'. Note ---- @@ -143,7 +149,7 @@ def _pqm_test( refs, x_samples, y_samples = _sample_reference_indices_numpy( Nx, Ny, Ng, x_samples, y_samples ) - counts_x, counts_y = _compute_counts_numpy(x_samples, y_samples, refs, num_refs) + counts_x, counts_y = _compute_counts_numpy(x_samples, y_samples, refs, num_refs, kernel) # Remove references with no counts C = (counts_x > 0) | (counts_y > 0) @@ -183,6 +189,7 @@ def pqm_pvalue( z_score_norm: bool = False, x_frac: Optional[float] = None, gauss_frac: float = 0.0, + kernel: str = "euclidean", ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` @@ -219,6 +226,12 @@ def pqm_pvalue( determined from the combined x_samples/y_samples. This ensures full support of the reference samples if pathological behavior is expected. Default: 0.0 no gaussian samples. + kernel : str or callable + Kernel function to use for distance calculation. If a string, must be + one of 'euclidean', 'cityblock', 'cosine', 'chebyshev', 'canberra', or + 'correlation' (see ``scipy.distance.cdist``). If a callable, must take + two vectors and return a scalar, should also be commutative. This only + works for numpy array inputs. Default: 'euclidean'. Note ---- @@ -250,11 +263,14 @@ def pqm_pvalue( z_score_norm=z_score_norm, x_frac=x_frac, gauss_frac=gauss_frac, + kernel=kernel, ) for _ in range(re_tessellation) ] - _, p_value, _, _ = _pqm_test(x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac) + _, p_value, _, _ = _pqm_test( + x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac, kernel + ) # Return p-value as a float return p_value @@ -268,6 +284,7 @@ def pqm_chi2( z_score_norm: bool = False, x_frac: Optional[float] = None, gauss_frac: float = 0.0, + kernel: str = "euclidean", ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` @@ -304,6 +321,12 @@ def pqm_chi2( determined from the combined x_samples/y_samples. This ensures full support of the reference samples if pathological behavior is expected. Default: 0.0 no gaussian samples. + kernel : str or callable + Kernel function to use for distance calculation. If a string, must be + one of 'euclidean', 'cityblock', 'cosine', 'chebyshev', 'canberra', or + 'correlation' (see ``scipy.distance.cdist``). If a callable, must take + two vectors and return a scalar, should also be commutative. This only + works for numpy array inputs. Default: 'euclidean'. Note ---- @@ -344,17 +367,16 @@ def pqm_chi2( z_score_norm=z_score_norm, x_frac=x_frac, gauss_frac=gauss_frac, + kernel=kernel, ) for _ in range(re_tessellation) ] - chi2_stat, _, dof, _ = _pqm_test( - x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac + _, p_value, _, _ = _pqm_test( + x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac, kernel ) - # Rescale chi2 statistic if necessary - if dof != num_refs - 1: - chi2_stat = _rescale_chi2(chi2_stat, dof, num_refs - 1) + chi2_stat = chi2.isf(p_value, num_refs - 1) # Return chi2_stat as a float return chi2_stat diff --git a/src/pqm/utils.py b/src/pqm/utils.py index 372aa42..0b97c52 100644 --- a/src/pqm/utils.py +++ b/src/pqm/utils.py @@ -6,7 +6,6 @@ __all__ = ( "_mean_std_torch", "_mean_std_numpy", - "_rescale_chi2", "_chi2_contingency_torch", "_sample_reference_indices_numpy", "_compute_counts_numpy", @@ -51,19 +50,6 @@ def _mean_std_numpy(sample1, sample2): return m, s -def _rescale_chi2(chi2_stat, orig_dof, target_dof): - """ - Rescale chi2 statistic using appropriate methods depending on the offset. - """ - - if chi2_stat / orig_dof < 10: - # Use cumulative probability method for better accuracy - cp = chi2.sf(chi2_stat, orig_dof) - return chi2.isf(cp, target_dof) - # Use simple scaling for large values - return chi2_stat * target_dof / orig_dof - - def _sample_reference_indices_torch(Nx, Ny, Ng, x_samples, y_samples): """ Helper function to sample references for GPU-based Torch computations. @@ -149,7 +135,7 @@ def _sample_reference_indices_numpy(Nx, Ny, Ng, x_samples, y_samples): return refs, x_samples, y_samples -def _compute_counts_torch(x_samples, y_samples, refs, num_refs): +def _compute_counts_torch(x_samples, y_samples, refs, num_refs, p=2.0): """ Helper function to calculate distances for GPU-based Torch computations. @@ -176,8 +162,8 @@ def _compute_counts_torch(x_samples, y_samples, refs, num_refs): """ # Compute distances and find nearest references - distances_x = torch.cdist(x_samples, refs) - distances_y = torch.cdist(y_samples, refs) + distances_x = torch.cdist(x_samples, refs, p=p) + distances_y = torch.cdist(y_samples, refs, p=p) idx_x = distances_x.argmin(dim=1) idx_y = distances_y.argmin(dim=1) @@ -188,7 +174,7 @@ def _compute_counts_torch(x_samples, y_samples, refs, num_refs): return counts_x.cpu().numpy(), counts_y.cpu().numpy() -def _compute_counts_numpy(x_samples, y_samples, refs, num_refs): +def _compute_counts_numpy(x_samples, y_samples, refs, num_refs, kernel="euclidean"): """ Helper function to calculate distances for CPU-based NumPy computations. @@ -207,6 +193,8 @@ def _compute_counts_numpy(x_samples, y_samples, refs, num_refs): Number of reference samples used in the test. num_refs : int Number of reference samples to use. + kernel : str or callable + kernel function for distance calculations. Returns ------- @@ -215,8 +203,8 @@ def _compute_counts_numpy(x_samples, y_samples, refs, num_refs): """ # Compute distances - distances_x = cdist(x_samples, refs, metric="euclidean") - distances_y = cdist(y_samples, refs, metric="euclidean") + distances_x = cdist(x_samples, refs, metric=kernel) + distances_y = cdist(y_samples, refs, metric=kernel) # Nearest references idx_x = np.argmin(distances_x, axis=1)