From e4c40beed82930aa88f7e05eb849b8315595ccf8 Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Wed, 14 Aug 2024 09:30:16 -0400 Subject: [PATCH 1/3] ref samples now distributed from each set by multinomial --- src/pqm/pqm.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py index 8ef12bd..e8911da 100644 --- a/src/pqm/pqm.py +++ b/src/pqm/pqm.py @@ -1,4 +1,5 @@ from typing import Optional +import warnings import numpy as np from scipy.stats import chi2_contingency, chi2 @@ -65,11 +66,16 @@ def _pqm_test( if x_frac is None: x_frac = len(x_samples) / (len(x_samples) + len(y_samples)) + # Determine number of samples from each distribution (x_samples, y_samples, gaussian) + Nx, Ny, Ng = np.random.multinomial( + num_refs, + [x_frac * (1.0 - gauss_frac), (1.0 - x_frac) * (1.0 - gauss_frac), gauss_frac], + ) + print(Nx, Ny, Ng) + # Collect reference samples from x_samples if x_frac > 0: - xrefs = np.random.choice( - len(x_samples), int(x_frac * (1.0 - gauss_frac) * num_refs), replace=False - ) + xrefs = np.random.choice(len(x_samples), Nx, replace=False) N = np.arange(len(x_samples)) N[xrefs] = -1 N = N[N >= 0] @@ -79,9 +85,7 @@ def _pqm_test( # Collect reference samples from y_samples if x_frac < 1: - yrefs = np.random.choice( - len(y_samples), int((1.0 - x_frac) * (1.0 - gauss_frac) * num_refs), replace=False - ) + yrefs = np.random.choice(len(y_samples), Ny, replace=False) N = np.arange(len(y_samples)) N[yrefs] = -1 N = N[N >= 0] @@ -93,11 +97,19 @@ def _pqm_test( refs = np.concatenate([xrefs, yrefs], axis=0) # get gaussian reference points if requested - if gauss_frac > 0: + if Ng > 0: + if Nx + Ny > 2: + m, s = np.mean(refs, axis=0), np.std(refs, axis=0) + else: + warnings.warn( + f"Very low number of x/y reference samples used ({Nx+Ny}). Initializing gaussian from y_samples.", + UserWarning, + ) + m, s = np.mean(y_samples, axis=0), np.std(y_samples, axis=0) gauss_refs = np.random.normal( - loc=np.mean(refs, axis=0), - scale=np.std(refs, axis=0), - size=(int(gauss_frac * num_refs), *refs.shape[1:]), + loc=m, + scale=s, + size=(Ng, *x_samples.shape[1:]), ) refs = np.concatenate([refs, gauss_refs], axis=0) From 9bef103b58cc7b20a81e167c04e7df86d8149dab Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Wed, 14 Aug 2024 09:56:51 -0400 Subject: [PATCH 2/3] Improve docstrings --- src/pqm/pqm.py | 64 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py index e8911da..d16bcb2 100644 --- a/src/pqm/pqm.py +++ b/src/pqm/pqm.py @@ -40,8 +40,18 @@ def _pqm_test( gauss_frac : float Fraction of samples to take from gaussian distribution with mean/std determined from the other reference samples. This ensures full support - of the reference samples if pathological behavior is expected. - Default: 0.0 no gaussian samples. + of the reference samples if pathological behavior is expected. Default: + 0.0 no gaussian samples. + + Note + ---- + When using ``x_frac`` and ``gauss_frac``, note that the number of + reference samples from the x_samples, y_samples, and gaussian + distribution will be determined by a multinomial distribution. This + means that the actual number of reference samples from each distribution + may not be exactly equal to the requested fractions, but will on average + equal those numbers. For best results, we suggest using a large number + of re-tessellations, though this is our recommendation in any case. Returns ------- @@ -71,7 +81,6 @@ def _pqm_test( num_refs, [x_frac * (1.0 - gauss_frac), (1.0 - x_frac) * (1.0 - gauss_frac), gauss_frac], ) - print(Nx, Ny, Ng) # Collect reference samples from x_samples if x_frac > 0: @@ -102,7 +111,7 @@ def _pqm_test( m, s = np.mean(refs, axis=0), np.std(refs, axis=0) else: warnings.warn( - f"Very low number of x/y reference samples used ({Nx+Ny}). Initializing gaussian from y_samples.", + f"Very low number of x/y reference samples used ({Nx+Ny}). Initializing gaussian from all y_samples. We suggest increasing num_refs or decreasing gauss_frac.", UserWarning, ) m, s = np.mean(y_samples, axis=0), np.std(y_samples, axis=0) @@ -140,7 +149,8 @@ def pqm_pvalue( ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` - are drawn form the same distribution. + are drawn from the same distribution. This version returns the pvalue under + the null hypothesis that both samples are drawn from the same distribution. Parameters ---------- @@ -170,8 +180,18 @@ def pqm_pvalue( gauss_frac : float Fraction of samples to take from gaussian distribution with mean/std determined from the other reference samples. This ensures full support - of the reference samples if pathological behavior is expected. - Default: 0.0 no gaussian samples + of the reference samples if pathological behavior is expected. Default: + 0.0 no gaussian samples. + + Note + ---- + When using ``x_frac`` and ``gauss_frac``, note that the number of + reference samples from the x_samples, y_samples, and gaussian + distribution will be determined by a multinomial distribution. This + means that the actual number of reference samples from each distribution + may not be exactly equal to the requested fractions, but will on average + equal those numbers. For best results, we suggest using a large number + of re-tessellations, though this is our recommendation in any case. Returns ------- @@ -206,7 +226,8 @@ def pqm_chi2( ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` - are drawn form the same distribution. + are drawn from the same distribution. This version returns the chi^2 + statistic with dof = num_refs-1. Parameters ---------- @@ -236,13 +257,34 @@ def pqm_chi2( gauss_frac : float Fraction of samples to take from gaussian distribution with mean/std determined from the other reference samples. This ensures full support - of the reference samples if pathological behavior is expected. - Default: 0.0 no gaussian samples + of the reference samples if pathological behavior is expected. Default: + 0.0 no gaussian samples. + + Note + ---- + When using ``x_frac`` and ``gauss_frac``, note that the number of + reference samples from the x_samples, y_samples, and gaussian + distribution will be determined by a multinomial distribution. This + means that the actual number of reference samples from each distribution + may not be exactly equal to the requested fractions, but will on average + equal those numbers. For best results, we suggest using a large number + of re-tessellations, though this is our recommendation in any case. + + Note + ---- + Some voronoi bins may be empty after counting. Due to the nature of + ``scipy.stats.chi2_contingency`` we must first remove those voronoi + cells, meaning that the dof of the chi^2 would change. To mitigate this + effect, we rescale the chi^2 statistic to have the same cumulative + probability as the desired chi^2 statistic. Thus, the returned chi^2 is + always in reference to dof = num_refs-1. Thus users should not need to + worry about this, but it is worth noting, please contact us if you + notice unusual behavior. Returns ------- float or list - chi2 statistic(s) and degree(s) of freedom. + chi2 statistic(s). """ if re_tessellation is not None: return [ From 574651a33428c880b849c85ea71572027639a730 Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Wed, 14 Aug 2024 10:23:45 -0400 Subject: [PATCH 3/3] Tweak x/y sampling condition --- src/pqm/pqm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py index d16bcb2..c016a47 100644 --- a/src/pqm/pqm.py +++ b/src/pqm/pqm.py @@ -83,7 +83,7 @@ def _pqm_test( ) # Collect reference samples from x_samples - if x_frac > 0: + if Nx > 0: xrefs = np.random.choice(len(x_samples), Nx, replace=False) N = np.arange(len(x_samples)) N[xrefs] = -1 @@ -93,7 +93,7 @@ def _pqm_test( xrefs = np.zeros((0,) + x_samples.shape[1:]) # Collect reference samples from y_samples - if x_frac < 1: + if Ny > 0: yrefs = np.random.choice(len(y_samples), Ny, replace=False) N = np.arange(len(y_samples)) N[yrefs] = -1