From 536fd758705d8ba69cfe68ba0fd0e986d7c5eecb Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Wed, 20 Nov 2024 10:22:28 -0500
Subject: [PATCH] Add kernel option, simpler chi2 calculation

---
 src/pqm/pqm.py   | 46 ++++++++++++++++++++++++++++++++++------------
 src/pqm/utils.py | 28 ++++++++--------------------
 2 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py
index 767dddc..bfd9740 100644
--- a/src/pqm/pqm.py
+++ b/src/pqm/pqm.py
@@ -1,9 +1,9 @@
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple, Callable
 import warnings
 
 import torch
 import numpy as np
-from scipy.stats import chi2_contingency
+from scipy.stats import chi2_contingency, chi2
 
 from .utils import (
     _mean_std_numpy,
@@ -12,7 +12,6 @@
     _sample_reference_indices_torch,
     _compute_counts_numpy,
     _compute_counts_torch,
-    _rescale_chi2,
 )
 
 __all__ = ("pqm_chi2", "pqm_pvalue")
@@ -25,6 +24,7 @@ def _pqm_test(
     z_score_norm: bool,
     x_frac: Optional[float],
     gauss_frac: float,
+    kernel: Union[str, Callable] = "euclidean",
 ) -> Tuple:
     """
     Helper function to perform the PQM test and return the results from
@@ -46,8 +46,8 @@ def _pqm_test(
         Number of times _pqm_test is called, re-tesselating the space. No
         re_tessellation if None (default).
     z_score_norm : bool
-        If True, z_score_norm the samples by subtracting the mean and dividing by the
-        standard deviation. mean and std are calculated from the combined
+        If True, z_score_norm the samples by subtracting the mean and dividing
+        by the standard deviation. mean and std are calculated from the combined
         x_samples and y_samples.
     x_frac : float
         Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
@@ -60,6 +60,12 @@ def _pqm_test(
         determined from the combined x_samples/y_samples. This ensures full
         support of the reference samples if pathological behavior is expected.
         Default: 0.0 no gaussian samples.
+    kernel : str or callable
+        Kernel function to use for distance calculation. If a string, must be
+        one of 'euclidean', 'cityblock', 'cosine', 'chebyshev', 'canberra', or
+        'correlation' (see ``scipy.distance.cdist``). If a callable, must take
+        two vectors and return a scalar, should also be commutative. This only
+        works for numpy array inputs. Default: 'euclidean'.
 
     Note
     ----
@@ -143,7 +149,7 @@ def _pqm_test(
         refs, x_samples, y_samples = _sample_reference_indices_numpy(
             Nx, Ny, Ng, x_samples, y_samples
         )
-        counts_x, counts_y = _compute_counts_numpy(x_samples, y_samples, refs, num_refs)
+        counts_x, counts_y = _compute_counts_numpy(x_samples, y_samples, refs, num_refs, kernel)
 
     # Remove references with no counts
     C = (counts_x > 0) | (counts_y > 0)
@@ -183,6 +189,7 @@ def pqm_pvalue(
     z_score_norm: bool = False,
     x_frac: Optional[float] = None,
     gauss_frac: float = 0.0,
+    kernel: str = "euclidean",
 ):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
@@ -219,6 +226,12 @@ def pqm_pvalue(
         determined from the combined x_samples/y_samples. This ensures full
         support of the reference samples if pathological behavior is expected.
         Default: 0.0 no gaussian samples.
+    kernel : str or callable
+        Kernel function to use for distance calculation. If a string, must be
+        one of 'euclidean', 'cityblock', 'cosine', 'chebyshev', 'canberra', or
+        'correlation' (see ``scipy.distance.cdist``). If a callable, must take
+        two vectors and return a scalar, should also be commutative. This only
+        works for numpy array inputs. Default: 'euclidean'.
 
     Note
     ----
@@ -250,11 +263,14 @@ def pqm_pvalue(
                 z_score_norm=z_score_norm,
                 x_frac=x_frac,
                 gauss_frac=gauss_frac,
+                kernel=kernel,
             )
             for _ in range(re_tessellation)
         ]
 
-    _, p_value, _, _ = _pqm_test(x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac)
+    _, p_value, _, _ = _pqm_test(
+        x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac, kernel
+    )
 
     # Return p-value as a float
     return p_value
@@ -268,6 +284,7 @@ def pqm_chi2(
     z_score_norm: bool = False,
     x_frac: Optional[float] = None,
     gauss_frac: float = 0.0,
+    kernel: str = "euclidean",
 ):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
@@ -304,6 +321,12 @@ def pqm_chi2(
         determined from the combined x_samples/y_samples. This ensures full
         support of the reference samples if pathological behavior is expected.
         Default: 0.0 no gaussian samples.
+    kernel : str or callable
+        Kernel function to use for distance calculation. If a string, must be
+        one of 'euclidean', 'cityblock', 'cosine', 'chebyshev', 'canberra', or
+        'correlation' (see ``scipy.distance.cdist``). If a callable, must take
+        two vectors and return a scalar, should also be commutative. This only
+        works for numpy array inputs. Default: 'euclidean'.
 
     Note
     ----
@@ -344,17 +367,16 @@ def pqm_chi2(
                 z_score_norm=z_score_norm,
                 x_frac=x_frac,
                 gauss_frac=gauss_frac,
+                kernel=kernel,
             )
             for _ in range(re_tessellation)
         ]
 
-    chi2_stat, _, dof, _ = _pqm_test(
-        x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac
+    _, p_value, _, _ = _pqm_test(
+        x_samples, y_samples, num_refs, z_score_norm, x_frac, gauss_frac, kernel
     )
 
-    # Rescale chi2 statistic if necessary
-    if dof != num_refs - 1:
-        chi2_stat = _rescale_chi2(chi2_stat, dof, num_refs - 1)
+    chi2_stat = chi2.isf(p_value, num_refs - 1)
 
     # Return chi2_stat as a float
     return chi2_stat
diff --git a/src/pqm/utils.py b/src/pqm/utils.py
index 372aa42..0b97c52 100644
--- a/src/pqm/utils.py
+++ b/src/pqm/utils.py
@@ -6,7 +6,6 @@
 __all__ = (
     "_mean_std_torch",
     "_mean_std_numpy",
-    "_rescale_chi2",
     "_chi2_contingency_torch",
     "_sample_reference_indices_numpy",
     "_compute_counts_numpy",
@@ -51,19 +50,6 @@ def _mean_std_numpy(sample1, sample2):
     return m, s
 
 
-def _rescale_chi2(chi2_stat, orig_dof, target_dof):
-    """
-    Rescale chi2 statistic using appropriate methods depending on the offset.
-    """
-
-    if chi2_stat / orig_dof < 10:
-        # Use cumulative probability method for better accuracy
-        cp = chi2.sf(chi2_stat, orig_dof)
-        return chi2.isf(cp, target_dof)
-    # Use simple scaling for large values
-    return chi2_stat * target_dof / orig_dof
-
-
 def _sample_reference_indices_torch(Nx, Ny, Ng, x_samples, y_samples):
     """
     Helper function to sample references for GPU-based Torch computations.
@@ -149,7 +135,7 @@ def _sample_reference_indices_numpy(Nx, Ny, Ng, x_samples, y_samples):
     return refs, x_samples, y_samples
 
 
-def _compute_counts_torch(x_samples, y_samples, refs, num_refs):
+def _compute_counts_torch(x_samples, y_samples, refs, num_refs, p=2.0):
     """
     Helper function to calculate distances for GPU-based Torch computations.
 
@@ -176,8 +162,8 @@ def _compute_counts_torch(x_samples, y_samples, refs, num_refs):
     """
 
     # Compute distances and find nearest references
-    distances_x = torch.cdist(x_samples, refs)
-    distances_y = torch.cdist(y_samples, refs)
+    distances_x = torch.cdist(x_samples, refs, p=p)
+    distances_y = torch.cdist(y_samples, refs, p=p)
 
     idx_x = distances_x.argmin(dim=1)
     idx_y = distances_y.argmin(dim=1)
@@ -188,7 +174,7 @@ def _compute_counts_torch(x_samples, y_samples, refs, num_refs):
     return counts_x.cpu().numpy(), counts_y.cpu().numpy()
 
 
-def _compute_counts_numpy(x_samples, y_samples, refs, num_refs):
+def _compute_counts_numpy(x_samples, y_samples, refs, num_refs, kernel="euclidean"):
     """
     Helper function to calculate distances for CPU-based NumPy computations.
 
@@ -207,6 +193,8 @@ def _compute_counts_numpy(x_samples, y_samples, refs, num_refs):
         Number of reference samples used in the test.
     num_refs : int
         Number of reference samples to use.
+    kernel : str or callable
+        kernel function for distance calculations.
 
     Returns
     -------
@@ -215,8 +203,8 @@ def _compute_counts_numpy(x_samples, y_samples, refs, num_refs):
     """
 
     # Compute distances
-    distances_x = cdist(x_samples, refs, metric="euclidean")
-    distances_y = cdist(y_samples, refs, metric="euclidean")
+    distances_x = cdist(x_samples, refs, metric=kernel)
+    distances_y = cdist(y_samples, refs, metric=kernel)
 
     # Nearest references
     idx_x = np.argmin(distances_x, axis=1)