From e4c40beed82930aa88f7e05eb849b8315595ccf8 Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Wed, 14 Aug 2024 09:30:16 -0400
Subject: [PATCH 1/3] ref samples now distributed from each set by multinomial

---
 src/pqm/pqm.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py
index 8ef12bd..e8911da 100644
--- a/src/pqm/pqm.py
+++ b/src/pqm/pqm.py
@@ -1,4 +1,5 @@
 from typing import Optional
+import warnings
 
 import numpy as np
 from scipy.stats import chi2_contingency, chi2
@@ -65,11 +66,16 @@ def _pqm_test(
     if x_frac is None:
         x_frac = len(x_samples) / (len(x_samples) + len(y_samples))
 
+    # Determine number of samples from each distribution (x_samples, y_samples, gaussian)
+    Nx, Ny, Ng = np.random.multinomial(
+        num_refs,
+        [x_frac * (1.0 - gauss_frac), (1.0 - x_frac) * (1.0 - gauss_frac), gauss_frac],
+    )
+    print(Nx, Ny, Ng)
+
     # Collect reference samples from x_samples
     if x_frac > 0:
-        xrefs = np.random.choice(
-            len(x_samples), int(x_frac * (1.0 - gauss_frac) * num_refs), replace=False
-        )
+        xrefs = np.random.choice(len(x_samples), Nx, replace=False)
         N = np.arange(len(x_samples))
         N[xrefs] = -1
         N = N[N >= 0]
@@ -79,9 +85,7 @@ def _pqm_test(
 
     # Collect reference samples from y_samples
     if x_frac < 1:
-        yrefs = np.random.choice(
-            len(y_samples), int((1.0 - x_frac) * (1.0 - gauss_frac) * num_refs), replace=False
-        )
+        yrefs = np.random.choice(len(y_samples), Ny, replace=False)
         N = np.arange(len(y_samples))
         N[yrefs] = -1
         N = N[N >= 0]
@@ -93,11 +97,19 @@ def _pqm_test(
     refs = np.concatenate([xrefs, yrefs], axis=0)
 
     # get gaussian reference points if requested
-    if gauss_frac > 0:
+    if Ng > 0:
+        if Nx + Ny > 2:
+            m, s = np.mean(refs, axis=0), np.std(refs, axis=0)
+        else:
+            warnings.warn(
+                f"Very low number of x/y reference samples used ({Nx+Ny}). Initializing gaussian from y_samples.",
+                UserWarning,
+            )
+            m, s = np.mean(y_samples, axis=0), np.std(y_samples, axis=0)
         gauss_refs = np.random.normal(
-            loc=np.mean(refs, axis=0),
-            scale=np.std(refs, axis=0),
-            size=(int(gauss_frac * num_refs), *refs.shape[1:]),
+            loc=m,
+            scale=s,
+            size=(Ng, *x_samples.shape[1:]),
         )
         refs = np.concatenate([refs, gauss_refs], axis=0)
 

From 9bef103b58cc7b20a81e167c04e7df86d8149dab Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Wed, 14 Aug 2024 09:56:51 -0400
Subject: [PATCH 2/3] Improve docstrings

---
 src/pqm/pqm.py | 64 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 11 deletions(-)

diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py
index e8911da..d16bcb2 100644
--- a/src/pqm/pqm.py
+++ b/src/pqm/pqm.py
@@ -40,8 +40,18 @@ def _pqm_test(
     gauss_frac : float
         Fraction of samples to take from gaussian distribution with mean/std
         determined from the other reference samples. This ensures full support
-        of the reference samples if pathological behavior is expected.
-        Default: 0.0 no gaussian samples.
+        of the reference samples if pathological behavior is expected. Default:
+        0.0 no gaussian samples.
+
+    Note
+    ----
+        When using ``x_frac`` and ``gauss_frac``, note that the number of
+        reference samples from the x_samples, y_samples, and gaussian
+        distribution will be determined by a multinomial distribution. This
+        means that the actual number of reference samples from each distribution
+        may not be exactly equal to the requested fractions, but will on average
+        equal those numbers. For best results, we suggest using a large number
+        of re-tessellations, though this is our recommendation in any case.
 
     Returns
     -------
@@ -71,7 +81,6 @@ def _pqm_test(
         num_refs,
         [x_frac * (1.0 - gauss_frac), (1.0 - x_frac) * (1.0 - gauss_frac), gauss_frac],
     )
-    print(Nx, Ny, Ng)
 
     # Collect reference samples from x_samples
     if x_frac > 0:
@@ -102,7 +111,7 @@ def _pqm_test(
             m, s = np.mean(refs, axis=0), np.std(refs, axis=0)
         else:
             warnings.warn(
-                f"Very low number of x/y reference samples used ({Nx+Ny}). Initializing gaussian from y_samples.",
+                f"Very low number of x/y reference samples used ({Nx+Ny}). Initializing gaussian from all y_samples. We suggest increasing num_refs or decreasing gauss_frac.",
                 UserWarning,
             )
             m, s = np.mean(y_samples, axis=0), np.std(y_samples, axis=0)
@@ -140,7 +149,8 @@ def pqm_pvalue(
 ):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
-    are drawn form the same distribution.
+    are drawn from the same distribution. This version returns the pvalue under
+    the null hypothesis that both samples are drawn from the same distribution.
 
     Parameters
     ----------
@@ -170,8 +180,18 @@ def pqm_pvalue(
     gauss_frac : float
         Fraction of samples to take from gaussian distribution with mean/std
         determined from the other reference samples. This ensures full support
-        of the reference samples if pathological behavior is expected.
-        Default: 0.0 no gaussian samples
+        of the reference samples if pathological behavior is expected. Default:
+        0.0 no gaussian samples.
+
+    Note
+    ----
+        When using ``x_frac`` and ``gauss_frac``, note that the number of
+        reference samples from the x_samples, y_samples, and gaussian
+        distribution will be determined by a multinomial distribution. This
+        means that the actual number of reference samples from each distribution
+        may not be exactly equal to the requested fractions, but will on average
+        equal those numbers. For best results, we suggest using a large number
+        of re-tessellations, though this is our recommendation in any case.
 
     Returns
     -------
@@ -206,7 +226,8 @@ def pqm_chi2(
 ):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
-    are drawn form the same distribution.
+    are drawn from the same distribution. This version returns the chi^2
+    statistic with dof = num_refs-1.
 
     Parameters
     ----------
@@ -236,13 +257,34 @@ def pqm_chi2(
     gauss_frac : float
         Fraction of samples to take from gaussian distribution with mean/std
         determined from the other reference samples. This ensures full support
-        of the reference samples if pathological behavior is expected.
-        Default: 0.0 no gaussian samples
+        of the reference samples if pathological behavior is expected. Default:
+        0.0 no gaussian samples.
+
+    Note
+    ----
+        When using ``x_frac`` and ``gauss_frac``, note that the number of
+        reference samples from the x_samples, y_samples, and gaussian
+        distribution will be determined by a multinomial distribution. This
+        means that the actual number of reference samples from each distribution
+        may not be exactly equal to the requested fractions, but will on average
+        equal those numbers. For best results, we suggest using a large number
+        of re-tessellations, though this is our recommendation in any case.
+
+    Note
+    ----
+        Some voronoi bins may be empty after counting. Due to the nature of
+        ``scipy.stats.chi2_contingency`` we must first remove those voronoi
+        cells, meaning that the dof of the chi^2 would change. To mitigate this
+        effect, we rescale the chi^2 statistic to have the same cumulative
+        probability as the desired chi^2 statistic. Thus, the returned chi^2 is
+        always in reference to dof = num_refs-1. Thus users should not need to
+        worry about this, but it is worth noting, please contact us if you
+        notice unusual behavior.
 
     Returns
     -------
     float or list
-        chi2 statistic(s) and degree(s) of freedom.
+        chi2 statistic(s).
     """
     if re_tessellation is not None:
         return [

From 574651a33428c880b849c85ea71572027639a730 Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Wed, 14 Aug 2024 10:23:45 -0400
Subject: [PATCH 3/3] Tweak x/y sampling condition

---
 src/pqm/pqm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py
index d16bcb2..c016a47 100644
--- a/src/pqm/pqm.py
+++ b/src/pqm/pqm.py
@@ -83,7 +83,7 @@ def _pqm_test(
     )
 
     # Collect reference samples from x_samples
-    if x_frac > 0:
+    if Nx > 0:
         xrefs = np.random.choice(len(x_samples), Nx, replace=False)
         N = np.arange(len(x_samples))
         N[xrefs] = -1
@@ -93,7 +93,7 @@ def _pqm_test(
         xrefs = np.zeros((0,) + x_samples.shape[1:])
 
     # Collect reference samples from y_samples
-    if x_frac < 1:
+    if Ny > 0:
         yrefs = np.random.choice(len(y_samples), Ny, replace=False)
         N = np.arange(len(y_samples))
         N[yrefs] = -1