From e4ba7d97d4c896357f6a863801cf6dfd765d9cb5 Mon Sep 17 00:00:00 2001 From: Joey Howlett Date: Thu, 6 May 2021 16:11:59 -0400 Subject: [PATCH 1/3] add p-value toyMC --- GOFevaluation/evaluators_nd.py | 74 +++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 9 deletions(-) diff --git a/GOFevaluation/evaluators_nd.py b/GOFevaluation/evaluators_nd.py index 5ca371c..f3ebfe0 100644 --- a/GOFevaluation/evaluators_nd.py +++ b/GOFevaluation/evaluators_nd.py @@ -11,7 +11,10 @@ class nd_test_statistics(test_statistics): """ def bin_data(self, bin_edges): # function to bin nD data: - self.binned_data, _ = np.histogramdd(self.data, bins=bin_edges) + if len(self.data.shape)==1: + self.binned_data, _ = np.histogram(self.data, bins=bin_edges) + else: + self.binned_data, _ = np.histogramdd(self.data, bins=bin_edges) class binned_poisson_chi2_gof(nd_test_statistics): @@ -20,26 +23,39 @@ class binned_poisson_chi2_gof(nd_test_statistics): In the limit of large bin counts (10+) this is Chi2 distributed. In the general case you may have to toyMC the distribution yourself. Input: - - nevents_expected: array with expected # of events (not rates) for each bin - data: array of equal shape as nevents_expected containing observed events in each bin + - pdf: normalized histogram of binned expectations + - bin_edges: bin-edges of the pdf + - nevents_expected: expectation, can be mean of expectation pdf Output: - the sum of the binned poisson Chi2. reference: https://doi.org/10.1016/0167-5087(84)90016-4 While the absolute likelihood is a poor GOF measure (see http://www.physics.ucla.edu/~cousins/stats/cousins_saturated.pdf) """ - def __minimalinit__(self, data, expectations): - # initialise with already binned data + expectations (typical case): + @classmethod + def from_binned(cls, data, expectations): + """Initialize with already binned data + expectations + + In this case the bin-edges don't matter, so we bypass the usual init + """ + self = cls(None, None, None, None) nd_test_statistics.__init__(self=self, data=data, pdf=expectations / np.sum(expectations), nevents_expected=np.sum(expectations)) self._name = self.__class__.__name__ self.binned_data = data + return self def __init__(self, data, pdf, bin_edges, nevents_expected): - #initialise with the common call signature + """Initialize with unbinned data and a normalized pdf + """ + if data is None: + # bypass init, using binned data + return + # initialise with the common call signature nd_test_statistics.__init__(self=self, data=data, pdf=pdf, @@ -48,9 +64,49 @@ def __init__(self, data, pdf, bin_edges, nevents_expected): self.bin_edges = bin_edges self.bin_data(bin_edges=bin_edges) + return - def calculate_gof(self): - ret = sps.poisson(self.binned_data).logpmf(self.binned_data) - ret -= sps.poisson(self.pdf * self.nevents_expected).logpmf( - self.binned_data) + @classmethod + def calculate_binned_gof(cls, binned_data, binned_expectations): + """Get Chi2 GoF from binned data & expectations + """ + ret = sps.poisson(binned_data).logpmf(binned_data) + ret -= sps.poisson(binned_expectations).logpmf(binned_data) return 2 * np.sum(ret) + + def calculate_gof(self): + """Get Chi2 GoF using current class attributes + """ + gof = binned_poisson_chi2_gof.calculate_binned_gof( + self.binned_data, + self.pdf * self.nevents_expected + ) + return gof + + def sample_gofs(self, n_mc=1000): + """Sample n_mc random Chi2 GoF's + + Simulates random data from the PDF and calculates its GoF n_mc times + """ + fake_gofs = np.zeros(n_mc) + for i in range(n_mc): + samples = sps.poisson(self.pdf * self.nevents_expected).rvs() + fake_gofs[i] = binned_poisson_chi2_gof.calculate_binned_gof( + samples, + self.pdf * self.nevents_expected, + ) + return fake_gofs + + def get_pvalue(self, n_mc=1000): + """Get the p-value of the data under the null hypothesis + + Gets the distribution of the GoF statistic, and compares it to the + GoF of the data given the expectations. + """ + gof = self.calculate_gof() + fake_gofs = self.sample_gofs(n_mc=n_mc) + hist, bin_edges = np.histogram(fake_gofs, bins=1000) + cumulative_density = 1.0 - np.cumsum(hist)/np.sum(hist) + bin_centers = np.mean(np.vstack([bin_edges[0:-1], bin_edges[1:]]), axis=0) + pvalue = cumulative_density[np.digitize(gof, bin_edges)-1] + return pvalue From 4955fa4b717025b62e7eb3fcdd6a57d516cb7044 Mon Sep 17 00:00:00 2001 From: Joey Howlett Date: Thu, 6 May 2021 16:37:23 -0400 Subject: [PATCH 2/3] raise better error --- GOFevaluation/evaluators_nd.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/GOFevaluation/evaluators_nd.py b/GOFevaluation/evaluators_nd.py index f3ebfe0..bd0226c 100644 --- a/GOFevaluation/evaluators_nd.py +++ b/GOFevaluation/evaluators_nd.py @@ -108,5 +108,8 @@ def get_pvalue(self, n_mc=1000): hist, bin_edges = np.histogram(fake_gofs, bins=1000) cumulative_density = 1.0 - np.cumsum(hist)/np.sum(hist) bin_centers = np.mean(np.vstack([bin_edges[0:-1], bin_edges[1:]]), axis=0) - pvalue = cumulative_density[np.digitize(gof, bin_edges)-1] + try: + pvalue = cumulative_density[np.digitize(gof, bin_edges)-1] + except IndexError: + raise ValueError('Not enough MC\'s run -- GoF is outside toy distribution!') return pvalue From 08b3ced9acf4ff9e0f30d4d532767b25a8db3040 Mon Sep 17 00:00:00 2001 From: Joey Howlett Date: Fri, 7 May 2021 08:52:10 -0400 Subject: [PATCH 3/3] move nD binning to parent class --- GOFevaluation/evaluators_nd.py | 18 +++--------------- GOFevaluation/test_statistics.py | 6 +++++- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/GOFevaluation/evaluators_nd.py b/GOFevaluation/evaluators_nd.py index bd0226c..21f10a6 100644 --- a/GOFevaluation/evaluators_nd.py +++ b/GOFevaluation/evaluators_nd.py @@ -5,19 +5,7 @@ from GOFevaluation import test_statistics -class nd_test_statistics(test_statistics): - """ - Override binning to work in arbitrary dimensions - """ - def bin_data(self, bin_edges): - # function to bin nD data: - if len(self.data.shape)==1: - self.binned_data, _ = np.histogram(self.data, bins=bin_edges) - else: - self.binned_data, _ = np.histogramdd(self.data, bins=bin_edges) - - -class binned_poisson_chi2_gof(nd_test_statistics): +class binned_poisson_chi2_gof(test_statistics): """ computes the binned poisson modified Chi2 from Baker+Cousins In the limit of large bin counts (10+) this is Chi2 distributed. @@ -41,7 +29,7 @@ def from_binned(cls, data, expectations): In this case the bin-edges don't matter, so we bypass the usual init """ self = cls(None, None, None, None) - nd_test_statistics.__init__(self=self, + test_statistics.__init__(self=self, data=data, pdf=expectations / np.sum(expectations), nevents_expected=np.sum(expectations)) @@ -56,7 +44,7 @@ def __init__(self, data, pdf, bin_edges, nevents_expected): # bypass init, using binned data return # initialise with the common call signature - nd_test_statistics.__init__(self=self, + test_statistics.__init__(self=self, data=data, pdf=pdf, nevents_expected=nevents_expected) diff --git a/GOFevaluation/test_statistics.py b/GOFevaluation/test_statistics.py index 7233e3a..bcf966f 100644 --- a/GOFevaluation/test_statistics.py +++ b/GOFevaluation/test_statistics.py @@ -13,7 +13,11 @@ def calculate_gof(self): raise NotImplementedError("Your goodnes of fit computation goes here!") def bin_data(self, bin_edges): - self.binned_data, _ = np.histogram(self.data, bins=bin_edges) + # function to bin nD data: + if len(self.data.shape)==1: + self.binned_data, _ = np.histogram(self.data, bins=bin_edges) + else: + self.binned_data, _ = np.histogramdd(self.data, bins=bin_edges) def get_result_as_dict(self): assert self._name is not None, str(self.__class__.__name__) + ": You need to define self._name for your goodnes of fit measure!"