Calculate P-Value via ToyMC #2

Merged: 3 commits, merged on May 7, 2021
Changes from all commits
87 changes: 67 additions & 20 deletions GOFevaluation/evaluators_nd.py
@@ -5,52 +5,99 @@
from GOFevaluation import test_statistics


-class nd_test_statistics(test_statistics):
-    """
-    Override binning to work in arbitrary dimensions
-    """
-    def bin_data(self, bin_edges):
-        # function to bin nD data:
-        self.binned_data, _ = np.histogramdd(self.data, bins=bin_edges)


-class binned_poisson_chi2_gof(nd_test_statistics):
+class binned_poisson_chi2_gof(test_statistics):
"""
computes the binned poisson modified Chi2 from Baker+Cousins
In the limit of large bin counts (10+) this is Chi2 distributed.
In the general case you may have to toyMC the distribution yourself.
Input:
- nevents_expected: array with expected # of events (not rates) for each bin
- data: array of equal shape as nevents_expected
containing observed events in each bin
- pdf: normalized histogram of binned expectations
- bin_edges: bin-edges of the pdf
- nevents_expected: expectation, can be mean of expectation pdf
Output:
- the sum of the binned poisson Chi2.
reference: https://doi.org/10.1016/0167-5087(84)90016-4
While the absolute likelihood is a poor GOF measure
(see http://www.physics.ucla.edu/~cousins/stats/cousins_saturated.pdf)
"""
-    def __minimalinit__(self, data, expectations):
-        # initialise with already binned data + expectations (typical case):
-        nd_test_statistics.__init__(self=self,
+    @classmethod
+    def from_binned(cls, data, expectations):
+        """Initialize with already binned data + expectations
+
+        In this case the bin-edges don't matter, so we bypass the usual init
+        """
+        self = cls(None, None, None, None)
+        test_statistics.__init__(self=self,
                                 data=data,
                                 pdf=expectations / np.sum(expectations),
                                 nevents_expected=np.sum(expectations))
        self._name = self.__class__.__name__
        self.binned_data = data
+        return self

    def __init__(self, data, pdf, bin_edges, nevents_expected):
-        #initialise with the common call signature
-        nd_test_statistics.__init__(self=self,
+        """Initialize with unbinned data and a normalized pdf
+        """
+        if data is None:
+            # bypass init, using binned data
+            return
+        # initialise with the common call signature
+        test_statistics.__init__(self=self,
                                 data=data,
                                 pdf=pdf,
                                 nevents_expected=nevents_expected)
        self._name = self.__class__.__name__

        self.bin_edges = bin_edges
        self.bin_data(bin_edges=bin_edges)
        return

-    def calculate_gof(self):
-        ret = sps.poisson(self.binned_data).logpmf(self.binned_data)
-        ret -= sps.poisson(self.pdf * self.nevents_expected).logpmf(
-            self.binned_data)
+    @classmethod
+    def calculate_binned_gof(cls, binned_data, binned_expectations):
+        """Get Chi2 GoF from binned data & expectations
+        """
+        ret = sps.poisson(binned_data).logpmf(binned_data)
+        ret -= sps.poisson(binned_expectations).logpmf(binned_data)
        return 2 * np.sum(ret)

+    def calculate_gof(self):
+        """Get Chi2 GoF using current class attributes
+        """
+        gof = binned_poisson_chi2_gof.calculate_binned_gof(
+            self.binned_data,
+            self.pdf * self.nevents_expected
+        )
+        return gof

+    def sample_gofs(self, n_mc=1000):
+        """Sample n_mc random Chi2 GoF's
+
+        Simulates random data from the PDF and calculates its GoF n_mc times
+        """
+        fake_gofs = np.zeros(n_mc)
+        for i in range(n_mc):
+            samples = sps.poisson(self.pdf * self.nevents_expected).rvs()
+            fake_gofs[i] = binned_poisson_chi2_gof.calculate_binned_gof(
+                samples,
+                self.pdf * self.nevents_expected,
+            )
+        return fake_gofs

+    def get_pvalue(self, n_mc=1000):
+        """Get the p-value of the data under the null hypothesis
+
+        Gets the distribution of the GoF statistic, and compares it to the
+        GoF of the data given the expectations.
+        """
+        gof = self.calculate_gof()
+        fake_gofs = self.sample_gofs(n_mc=n_mc)
+        hist, bin_edges = np.histogram(fake_gofs, bins=1000)
+        cumulative_density = 1.0 - np.cumsum(hist)/np.sum(hist)
+        bin_centers = np.mean(np.vstack([bin_edges[0:-1], bin_edges[1:]]), axis=0)
+        try:
+            pvalue = cumulative_density[np.digitize(gof, bin_edges)-1]
+        except IndexError:
+            raise ValueError('Not enough MC\'s run -- GoF is outside toy distribution!')
+        return pvalue
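
For readers skimming the diff, here is a minimal usage sketch of the API added above. It is a sketch only: the import path and the assumption that test_statistics.__init__ stores data, pdf and nevents_expected as attributes are inferred from this PR, not verified against the rest of the package.

    import numpy as np
    from GOFevaluation.evaluators_nd import binned_poisson_chi2_gof  # assumed import path

    # Expectation: 100 events spread uniformly over 10 bins in [0, 1]
    bin_edges = np.linspace(0, 1, 11)
    pdf = np.full(10, 0.1)              # normalized histogram of expectations
    nevents_expected = 100

    # Toy "observed" data drawn from the same model
    data = np.random.uniform(0, 1, size=100)

    gof = binned_poisson_chi2_gof(data, pdf, bin_edges, nevents_expected)
    print(gof.calculate_gof())          # Baker+Cousins Chi2 of the data
    print(gof.get_pvalue(n_mc=1000))    # toy-MC p-value under the null

    # Alternative entry point when the data are already binned
    counts, _ = np.histogram(data, bins=bin_edges)
    gof_binned = binned_poisson_chi2_gof.from_binned(counts, pdf * nevents_expected)
    print(gof_binned.calculate_gof())

Note that with n_mc toys the smallest resolvable p-value is of order 1/n_mc, so n_mc should be increased well beyond the default when small p-values matter.
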
6 changes: 5 additions & 1 deletion GOFevaluation/test_statistics.py
@@ -13,7 +13,11 @@ def calculate_gof(self):
        raise NotImplementedError("Your goodness of fit computation goes here!")

    def bin_data(self, bin_edges):
-        self.binned_data, _ = np.histogram(self.data, bins=bin_edges)
+        # function to bin nD data:
+        if len(self.data.shape) == 1:
+            self.binned_data, _ = np.histogram(self.data, bins=bin_edges)
+        else:
+            self.binned_data, _ = np.histogramdd(self.data, bins=bin_edges)

    def get_result_as_dict(self):
        assert self._name is not None, str(self.__class__.__name__) + ": You need to define self._name for your goodness of fit measure!"
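
The generalized bin_data above simply dispatches on the dimensionality of self.data: 1D samples keep going through np.histogram, while multi-dimensional samples use np.histogramdd. A standalone numpy illustration of the two branches (toy data made up for the example):

    import numpy as np

    # 1D: np.histogram returns (counts, edges)
    data_1d = np.random.uniform(0, 1, size=1000)
    counts_1d, _ = np.histogram(data_1d, bins=np.linspace(0, 1, 11))
    print(counts_1d.shape)      # (10,)

    # 2D: np.histogramdd takes one array of bin edges per dimension
    data_2d = np.random.uniform(0, 1, size=(1000, 2))
    edges_2d = [np.linspace(0, 1, 11), np.linspace(0, 1, 6)]
    counts_2d, _ = np.histogramdd(data_2d, bins=edges_2d)
    print(counts_2d.shape)      # (10, 5)
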