-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindividual_metrics.py
364 lines (330 loc) · 13.1 KB
/
individual_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
from __future__ import division
import numpy as np
import qp
import matplotlib.pyplot as plt
from scipy import stats
import skgof
class EvaluateMetric(object):
    """Per-object photo-z PDF quality metrics for a qp Ensemble.

    Takes a qp Ensemble of N PDF objects and an array of N "truth" spec-z
    values, and computes the PIT values, QQ vectors/plot, and the KS,
    Cramer-von Mises, and Anderson-Darling statistics of the PIT
    distribution compared against a uniform distribution.

    Parameters:
    ensemble_obj: qp ensemble
        a qp ensemble object containing the N PDFs
    truths: numpy array
        1D numpy array with the N spec-z values

    self.pitarray caches the PIT values once they are computed, as they are
    used by multiple metrics, so there's no need to compute them twice.
    """
    def __init__(self, ensemble_obj, truths):
        self.ensemble_obj = ensemble_obj
        self.truths = truths
        self.pitarray = None  # lazy cache of PIT values, shared by the metrics

    def _get_pits(self, using, dx):
        """Return the cached PIT array, computing and caching it if absent."""
        if self.pitarray is not None:
            return np.array(self.pitarray)
        pits = np.array(self.PIT(using=using, dx=dx))
        self.pitarray = pits
        return pits

    def PIT(self, using='gridded', dx=0.0001):
        """Compute the Probability Integral Transform (PIT), described in
        Tanaka et al. 2017 (ArXiv 1704.05988), which is the integral of the
        PDF from 0 to zref.

        Parameters:
        using: string
            which parameterization to evaluate
        dx: float
            step size used in integral

        Returns
        -------
        ndarray
            The values of the PIT for each ensemble object, or None if the
            number of truths does not match the ensemble size.
            Also stores PIT array in self.pitarray
        """
        if len(self.truths) != self.ensemble_obj.n_pdfs:
            # print() function form so the module runs under Python 2 and 3
            print('Warning: number of zref values not equal to number of ensemble objects')
            return
        n = self.ensemble_obj.n_pdfs
        pitlimits = np.zeros([n, 2])  # lower integration limits stay at 0.0
        pitlimits[:, 1] = self.truths
        tmppit = self.ensemble_obj.integrate(limits=pitlimits, using=using, dx=dx)
        self.pitarray = np.array(tmppit)
        return tmppit

    def QQvectors(self, using, dx=0.0001, Nquants=101):
        """Return quantile-quantile vectors for the ensemble using the PIT
        values, without actually plotting them.  Useful for multi-panel
        plots: simply takes the percentiles of the PIT values to get the
        Qdata quantiles.

        Parameters:
        using: string
            which parameterization to evaluate
        dx: float
            step size for integral
        Nquants: int
            the number of quantile bins to compute, default 101

        Returns
        -------
        numpy arrays for Qtheory and Qdata
        """
        pits = self._get_pits(using, dx)
        quants = np.linspace(0., 100., Nquants)
        Qtheory = quants / 100.
        Qdata = np.percentile(pits, quants)
        return Qtheory, Qdata

    def QQplot(self, using, dx=0.0001, Nquants=101):
        """Quantile-quantile plot for the ensemble using the PIT values;
        simply takes the percentiles of the values to get the Qdata
        quantiles, and saves the figure to "QQplot.jpg".

        Parameters:
        using: string
            which parameterization to evaluate
        dx: float
            step size for integral
        Nquants: int
            the number of quantile bins to compute, default 101

        Returns
        -------
        matplotlib plot of the quantiles (written to QQplot.jpg)
        """
        Qtheory, Qdata = self.QQvectors(using, dx=dx, Nquants=Nquants)
        plt.figure(figsize=(10, 10))
        plt.plot(Qtheory, Qdata, c='b', linestyle='-', linewidth=3, label='QQ')
        # diagonal reference line for a perfectly uniform PIT distribution
        plt.plot([0, 1], [0, 1], color='k', linestyle='-', linewidth=2)
        plt.xlabel("Qtheory", fontsize=18)
        plt.ylabel("Qdata", fontsize=18)
        plt.legend()
        plt.savefig("QQplot.jpg")
        return

    def KS(self, using, dx=0.0001):
        """Compute the Kolmogorov-Smirnov statistic and p-value for the PIT
        values by comparing with a uniform distribution between 0 and 1.

        Parameters:
        -----------
        using: string
            which parameterization to evaluate
        dx: float
            step size for integral

        Returns:
        --------
        KS statistic and pvalue
        """
        pits = self._get_pits(using, dx)
        ks_result = skgof.ks_test(pits, stats.uniform())
        return ks_result.statistic, ks_result.pvalue

    def CvM(self, using, dx=0.0001):
        """Compute the Cramer-von Mises statistic and p-value for the PIT
        values by comparing with a uniform distribution between 0 and 1.

        Parameters:
        -----------
        using: string
            which parameterization to evaluate
        dx: float
            step size for integral

        Returns:
        --------
        CvM statistic and pvalue
        """
        pits = self._get_pits(using, dx)
        cvm_result = skgof.cvm_test(pits, stats.uniform())
        return cvm_result.statistic, cvm_result.pvalue

    def AD(self, using, dx=0.0001, vmin=0.005, vmax=0.995):
        """Compute the Anderson-Darling statistic and p-value for the PIT
        values by comparing with a uniform distribution between 0 and 1.
        Since the statistic diverges at 0 and 1, PIT values too close to
        0 or 1 are discarded, and the comparison uniform distribution is
        restricted to [vmin, vmax].

        Parameters:
        -----------
        using: string
            which parameterization to evaluate
        dx: float
            step size for integral
        vmin, vmax: floats
            PIT values outside this range are discarded

        Returns:
        --------
        AD statistic and pvalue
        """
        pits = self._get_pits(using, dx)
        mask = (pits > vmin) & (pits < vmax)
        delv = vmax - vmin
        ad_result = skgof.ad_test(pits[mask], stats.uniform(loc=vmin, scale=delv))
        return ad_result.statistic, ad_result.pvalue
class NzSumEvaluateMetric(object):
    """Metrics comparing the stacked N(z) of a qp Ensemble to the true
    spec-z distribution.

    Takes a qp Ensemble of N PDF objects, stacks them into a single N(z)
    estimate, and compares that estimate against the distribution of the N
    "truth" spec-z values via KS, CvM, and AD statistics.

    Parameters:
    ensemble_obj: qp ensemble object
        a qp ensemble object of N PDFs that will be stacked
    truth_vals: numpy array
        1D numpy array with the N true spec-z values
    eval_grid: numpy array or None
        the numpy array to evaluate the metrics on.  If fed "None"
        will default to np.arange(0.005,2.12,0.01), i.e. the grid for BPZ
    using: string
        which parameterization to evaluate
    dx: float
        step size used for integrals
    """
    def __init__(self, ensemble_obj, truth_vals, eval_grid=None, using='gridded', dx=0.001):
        self.ensemble_obj = ensemble_obj
        self.truth = truth_vals
        if eval_grid is None:
            self.eval_grid = np.arange(0.005, 2.12, 0.01)
            # print() function form so the module runs under Python 2 and 3
            print("using default evaluation grid of numpy.arange(0.005,2.12,0.01)\n")
        else:
            self.eval_grid = eval_grid
        self.using = using
        self.dx = dx
        # stack the ensemble evaluated at the eval_grid points, and wrap the
        # result back up as a single qp.PDF representing N(z)
        stacked = self.ensemble_obj.stack(loc=self.eval_grid, using='gridded')
        self.stackpz = qp.PDF(gridded=(stacked['gridded'][0], stacked['gridded'][1]))
        return

    def WriteNzSumVec(self, newgrid, outfile="default_Nzoutput.dat"):
        """Write the N(z) vector stored in self.stackpz, evaluated on the
        array newgrid, to the output file outfile.
        Also saves the vectors in self.pzsumvec.

        Parameters:
        -----------
        newgrid: numpy array of values at which to evaluate stackpz
        outfile: string, name of output file to write data to
        """
        pzsumvec = self.stackpz.evaluate(newgrid, 'gridded', False, False)
        self.pzsumvec = pzsumvec
        # column_stack rather than zip(): under Python 3, zip() returns an
        # iterator that np.savetxt cannot consume as a 2D array
        np.savetxt(outfile, np.column_stack((pzsumvec[0], pzsumvec[1])), fmt="%.5g %.5g")
        return

    def NZKS(self):
        """Compute the Kolmogorov-Smirnov statistic and p-value for the
        two distributions of sumpz and true_z.

        Returns:
        --------
        KS statistic and pvalue
        """
        # skgof functions expect an object with a .cdf method for a vector
        # of values, so wrap the stacked PDF in the QPPDFCDF adapter
        tmpnzfunc = QPPDFCDF(self.stackpz)
        nzks = skgof.ks_test(self.truth, tmpnzfunc)
        return nzks.statistic, nzks.pvalue

    def NZCVM(self):
        """Compute the Cramer-von Mises statistic and p-value for the
        two distributions of sumpz and the true_z vector of spec-z's.

        Returns:
        --------
        CvM statistic and pvalue
        """
        # skgof functions expect an object with a .cdf method for a vector
        # of values, so wrap the stacked PDF in the QPPDFCDF adapter
        tmpnzfunc = QPPDFCDF(self.stackpz)
        nzCvM = skgof.cvm_test(self.truth, tmpnzfunc)
        return nzCvM.statistic, nzCvM.pvalue

    def NZAD(self, vmin=0.005, vmax=1.995, delv=0.05):
        """Compute the Anderson-Darling statistic and p-value for the
        two distributions of sumpz and the true_z vector of spec-z's.

        Since the Anderson-Darling test requires a properly normalized
        distribution over the [vmin,vmax] range, a new qp object is created
        on the grid np.arange(vmin,vmax+delv,delv), and spec-z values
        outside (vmin,vmax) are discarded.

        Parameters:
        -----------
        vmin, vmax: floats
            spec-z values outside of these values are discarded
        delv: float
            grid spacing for the [vmin,vmax] interval of the new qp object

        Returns:
        --------
        Anderson-Darling statistic and pvalue
        """
        print("using %f and %f for vmin and vmax\n" % (vmin, vmax))
        szs = self.truth
        mask = (szs > vmin) & (szs < vmax)
        vgrid = np.arange(vmin, vmax + delv, delv)
        # re-evaluate the stack on the restricted grid with normalization on
        veval = self.stackpz.evaluate(vgrid, 'gridded', True, False)
        vobj = qp.PDF(gridded=(veval[0], veval[1]))
        tmpnzfunc = QPPDFCDF(vobj, self.dx)
        nzAD = skgof.ad_test(szs[mask], tmpnzfunc)
        return nzAD.statistic, nzAD.pvalue
class QPPDFCDF(object):
    """A quick wrapper for a qp.PDF object exposing the .pdf and .cdf
    interface expected by the skgof goodness-of-fit functions.

    Parameters:
    pdf_obj: qp pdf object with using='gridded' parameterization
    dx: float
        step size used when integrating the PDF to form the CDF
    """
    def __init__(self, pdf_obj, dx=0.0001):
        self.pdf_obj = pdf_obj
        self.dx = dx
        return

    def pdf(self, grid):
        """Return the pdf of the qp.PDF object by calling qp.PDF.evaluate.

        inputs:
            grid: float or np ndarray of values to evaluate the pdf at
        returns:
            pdf of object evaluated at points in grid
        """
        # evaluate returns (points, values); keep only the values
        return self.pdf_obj.evaluate(grid, 'gridded', True, False)[1]

    def cdf(self, xvals):
        """Return the CDF of the qp.PDF object by calling qp.PDF.integrate
        with limits between 0.0 and each value.

        inputs:
            xvals: float or np ndarray of values to evaluate the cdf at
        Returns:
            a scalar cdf value for scalar (size-1) input, otherwise the
            array of cdf values evaluated at xvals
        """
        vals = np.asarray(xvals, dtype=float)
        if vals.size == 1:
            # pass the scalar value itself as the upper limit, not the raw
            # input (which may be a length-1 list/array)
            return self.pdf_obj.integrate((0.0, float(vals.ravel()[0])),
                                          self.dx, 'gridded', False)
        flat = vals.ravel()
        cdfs = np.zeros(flat.size)
        for i, v in enumerate(flat):
            cdfs[i] = self.pdf_obj.integrate((0.0, v), self.dx, 'gridded', False)
        return cdfs