#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
__doc__="""
ATLAS Higgs Machine Learning Challenge 2014
Read CERN Open Data Portal Dataset http://opendata.cern.ch/record/328
and manipulate it
- KaggleWeight and KaggleSet are removed
- Label is changed from character to integer 0 or 1
- DetailLabel is introduced indicating subpopulations
- systematics effects are simulated
- bkg_weight_norm : manipulates the background weight of the W background
- tau_energy_scale : manipulates PRI_had_pt and recomputes other quantities accordingly
WARNING : the variable DER_mass_MMC is not properly manipulated (the modification is linearised),
so I advocate NOT using DER_mass_MMC when doSystTauEnergyScale is enabled.
There is a threshold in the original HiggsML file at 20 GeV on PRI_had_energy.
This threshold is moved when changing sysTauEnergyScale, which is unphysical.
So if you are going to play with sysTauEnergyScale (within 0.9-1.1),
I suggest you remove events below, say, 22 GeV *after* the manipulation.
Applying doSystTauEnergyScale with sysTauEnergyScale=1. does NOT yield results identical to not applying
doSystTauEnergyScale at all; this is because of rounding errors and the zero-mass approximation.
doSystTauEnergyScale impacts PRI_had_pt as well as PRI_met and PRI_met_phi.
So overall, when playing with doSystTauEnergyScale, I suggest the reference is :
- not using DER_mass_MMC
- applying PRI_had_pt > 22 *after* this manipulation
- running with sysTauEnergyScale=1. to obtain the reference
Author D. Rousseau LAL, Nov 2016
Modification Dec 2016 (V. Estrade):
- Wrap everything into separate functions.
- V4 class now handles 1D-vector values (to improve computation efficiency).
- Fix compatibility with both python 2 and 3.
- Use pandas.DataFrame to ease computation along columns
- Loading function for the base HiggsML dataset (fetch it on the internet if needed)
Refactor March 2017 (V. Estrade):
- Split load function (cleaner)
July 06 2017 (V. Estrade):
- Add normalization_weight function
May 2019 (D. Rousseau) :
- Major hack, in preparation for the CentraleSupélec EI:
python syst/datawarehouse/datawarehouse/higgsml.py -i atlas-higgs-challenge-2014-v2.csv.gz -o atlas-higgs-challenge-2014-v2-s0.csv
python higgsml_syst.py -i atlas-higgs-challenge-2014-v2.csv.gz -o atlas-higgs-challenge-2014-v2-syst1.csv --csv -p --BKGnorm 1. --Wnorm 1. --tes 1. --jes 1. --softMET 0. --seed 31415926
python higgsml_syst.py --help # for command help
reasonable values for parameters
BKGnorm : 1.05
Wnorm : 1.5
tes : 1.03
jes : 1.03
softMET : 3 GeV
"""
__version__ = "4.0"
__author__ = "David Rousseau, and Victor Estrade "
import sys
import os
import gzip
import copy
import pandas as pd
import numpy as np
def load_higgs(in_file):
    filename = in_file
    print("filename =", filename)
    data = pd.read_csv(filename)
    return data
# ==================================================================================
# V4 Class and physic computations
# ==================================================================================
class V4:
"""
    A simple 4-vector class to ease calculations; it works transparently on numpy arrays of 4-vectors (element-wise).
"""
px=0
py=0
pz=0
e=0
def __init__(self,apx=0., apy=0., apz=0., ae=0.):
"""
Constructor with 4 coordinates
"""
self.px = apx
self.py = apy
self.pz = apz
self.e = ae
if self.e + 1e-3 < self.p():
raise ValueError("Energy is too small! Energy: {}, p: {}".format(self.e, self.p()))
def copy(self):
return copy.deepcopy(self)
def p2(self):
return self.px**2 + self.py**2 + self.pz**2
def p(self):
return np.sqrt(self.p2())
def pt2(self):
return self.px**2 + self.py**2
def pt(self):
return np.sqrt(self.pt2())
def m(self):
return np.sqrt( np.abs( self.e**2 - self.p2() ) ) # abs is needed for protection
def eta(self):
return np.arcsinh( self.pz/self.pt() )
def phi(self):
return np.arctan2(self.py, self.px)
def deltaPhi(self, v):
"""delta phi with another v"""
return (self.phi() - v.phi() + 3*np.pi) % (2*np.pi) - np.pi
def deltaEta(self,v):
"""delta eta with another v"""
return self.eta()-v.eta()
def deltaR(self,v):
"""delta R with another v"""
return np.sqrt(self.deltaPhi(v)**2+self.deltaEta(v)**2 )
def eWithM(self,m=0.):
"""recompute e given m"""
return np.sqrt(self.p2()+m**2)
# FIXME this gives ugly prints with 1D-arrays
def __str__(self):
return "PxPyPzE( %s,%s,%s,%s)<=>PtEtaPhiM( %s,%s,%s,%s) " % (self.px, self.py,self.pz,self.e,self.pt(),self.eta(),self.phi(),self.m())
def scale(self,factor=1.): # scale
"""Apply a simple scaling"""
self.px *= factor
self.py *= factor
self.pz *= factor
self.e = np.abs( factor*self.e )
def scaleFixedM(self,factor=1.):
"""Scale (keeping mass unchanged)"""
m = self.m()
self.px *= factor
self.py *= factor
self.pz *= factor
self.e = self.eWithM(m)
def setPtEtaPhiM(self, pt=0., eta=0., phi=0., m=0):
"""Re-initialize with : pt, eta, phi and m"""
self.px = pt*np.cos(phi)
self.py = pt*np.sin(phi)
self.pz = pt*np.sinh(eta)
self.e = self.eWithM(m)
def sum(self, v):
"""Add another V4 into self"""
self.px += v.px
self.py += v.py
self.pz += v.pz
self.e += v.e
def __iadd__(self, other):
"""Add another V4 into self"""
try:
self.px += other.px
self.py += other.py
self.pz += other.pz
self.e += other.e
except AttributeError:
            # If 'other' is not a V4-like object, return NotImplemented (a special value, not an exception)
return NotImplemented
return self
def __add__(self, other):
"""Add 2 V4 vectors : v3 = v1 + v2 = v1.__add__(v2)"""
copy = self.copy()
try:
copy.px += other.px
copy.py += other.py
copy.pz += other.pz
copy.e += other.e
except AttributeError:
            # If 'other' is not a V4-like object, return NotImplemented (a special value, not an exception)
return NotImplemented
return copy
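# Minimal usage sketch (not part of the original script): V4 accepts numpy
# arrays as well as scalars, so a whole column of 4-vectors can be built and
# rescaled in one go. Values and the helper name _v4_example are illustrative.
def _v4_example():
    vtau = V4()
    vtau.setPtEtaPhiM(np.array([40., 55.]),   # pt  [GeV]
                      np.array([0.5, -1.2]),  # eta
                      np.array([0.1, 2.8]),   # phi
                      0.8)                    # tau mass 0.8 GeV, as used below
    vtau.scaleFixedM(1.03)                    # +3% energy scale, mass kept fixed
    return vtau.pt(), vtau.eta(), vtau.m()    # pt is scaled, eta and m unchanged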
# ==================================================================================
def getDetailLabel(origWeight, Label, num=True):
"""
Given original weight and label,
return more precise label specifying the original simulation type.
Args
----
origWeight: the original weight of the event
Label : the label of the event (can be {"b", "s"} or {0,1})
num: (default=True) if True use the numeric detail labels
else use the string detail labels. You should prefer numeric labels.
Return
------
    detailLabel: the corresponding detail label (defaults to 400, or "T" if num=False, when the weight is not found)
Note : Could be better optimized but this is fast enough.
"""
# prefer numeric detail label
detail_label_num={
57207:0, # Signal
4613:1,
8145:2,
4610:3,
917703: 105, #Z
5127399:111,
4435976:112,
4187604:113,
2407146:114,
1307751:115,
944596:122,
936590:123,
1093224:124,
225326:132,
217575:133,
195328:134,
254338:135,
2268701:300 #T
}
# complementary for W detaillabeldict=200
#previous alphanumeric detail label
detail_label_str={
57207:"S0",
4613:"S1",
8145:"S2",
4610:"S3",
917703:"Z05",
5127399:"Z11",
4435976:"Z12",
4187604:"Z13",
2407146:"Z14",
1307751:"Z15",
944596:"Z22",
936590:"Z23",
1093224:"Z24",
225326:"Z32",
217575:"Z33",
195328:"Z34",
254338:"Z35",
2268701:"W" # was T
}
if num:
detailLabelDict = detail_label_num
defaultLabel=400 # 400 "T" is the default value if not found
else:
detailLabelDict = detail_label_str
defaultLabel="T" #"T" is the default value if not found
iWeight=int(1e7*origWeight+0.5)
detailLabel = detailLabelDict.get(iWeight, defaultLabel)
    if detailLabel == defaultLabel and (Label != 0 and Label != 'b'):
        raise ValueError("ERROR! events not found in detailLabelDict should be background (Label==0 or 'b'), got ({}, {})".format(iWeight, Label))
return detailLabel
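# Hedged sketch (not part of the original script): getDetailLabel is not called
# anywhere in this file, but the "detailLabel" column used by w_bkg_weight_norm
# below could presumably be built with it, assuming "Weight" still holds the
# original, un-rescaled event weight. The helper name is hypothetical.
def add_detail_label_column(data):
    data["detailLabel"] = [getDetailLabel(w, l)
                           for w, l in zip(data["Weight"], data["Label"])]
    return data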
def w_bkg_weight_norm(data, systBkgNorm):
"""
    Apply a scaling to the weight of the W background events.
    Args
    ----
    data: the dataset, a pandas.DataFrame like object.
        This function will modify the given data inplace.
    systBkgNorm: the scale factor applied to the Weight of W background events (detailLabel == 300).
"""
# scale the weight, arbitrary but reasonable value
data["Weight"] = ( data["Weight"]*systBkgNorm ).where(data["detailLabel"] == 300, other=data["Weight"])
def all_bkg_weight_norm(data, systBkgNorm):
"""
    Apply a scaling to the weight of all background events.
    Args
    ----
    data: the dataset, a pandas.DataFrame like object.
        This function will modify the given data inplace.
    systBkgNorm: the scale factor applied to the Weight of all background events (Label == 0).
"""
# scale the weight, arbitrary but reasonable value
data["Weight"] = ( data["Weight"]*systBkgNorm ).where(data["Label"] == 0, other=data["Weight"])
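# Illustrative sketch (not part of the original script): the .where(cond, other=...)
# pattern used above rescales only the selected rows and leaves the others
# untouched. The toy values and the helper name _bkg_norm_example are made up.
def _bkg_norm_example():
    toy = pd.DataFrame({"Label": [0, 1, 0], "Weight": [2.0, 3.0, 4.0]})
    all_bkg_weight_norm(toy, 1.05)  # only Label==0 (background) rows are rescaled
    return toy                      # Weight column becomes [2.1, 3.0, 4.2]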
# ==================================================================================
# Manipulate the 4-momenta
# ==================================================================================
def mom4_manipulate(data, systTauEnergyScale, systJetEnergyScale, softMET, seed=31415926):
"""
    Manipulate primary inputs : PRI_had_pt, PRI_jet_leading_pt, PRI_jet_subleading_pt, and recompute the other values accordingly.
Args
----
data: the dataset should be a pandas.DataFrame like object.
This function will modify the given data inplace.
    systTauEnergyScale : the factor applied : PRI_had_pt <-- PRI_had_pt * systTauEnergyScale
    systJetEnergyScale : the factor applied : all jet pt <-- jet pt * systJetEnergyScale
        (MET is recomputed accordingly)
    softMET : width (GeV) of the gaussian soft MET term added to the MET
    seed : random seed used to draw the soft MET term
Notes :
-------
Recompute :
- PRI_had_pt
- PRI_jet_leading_pt
- PRI_jet_subleading_pt
- PRI_met
- PRI_met_phi
- PRI_met_sumet
    Round to 3 decimals.
"""
vmet = V4() # met 4-vector
vmet.setPtEtaPhiM(data["PRI_met"], 0., data["PRI_met_phi"], 0.) # met mass zero,
met_sumet=data["PRI_met_sumet"]
if systTauEnergyScale!=1.:
# scale tau energy scale, arbitrary but reasonable value
data["PRI_had_pt"] *= systTauEnergyScale
# build 4-vectors
vtau = V4() # tau 4-vector
vtau.setPtEtaPhiM(data["PRI_had_pt"], data["PRI_had_eta"], data["PRI_had_phi"], 0.8) # tau mass 0.8 like in original
#vlep = V4() # lepton 4-vector
#vlep.setPtEtaPhiM(data["PRI_lep_pt"], data["PRI_lep_eta"], data["PRI_lep_phi"], 0.) # lep mass 0 (either 0.106 or 0.0005 but info is lost)
        # fix MET according to the tau pt change (minus sign since MET is minus the sum pt of visible particles)
vtauDeltaMinus = vtau.copy()
vtauDeltaMinus.scaleFixedM( (1.-systTauEnergyScale)/systTauEnergyScale )
vmet += vtauDeltaMinus
vmet.pz = 0.
vmet.e = vmet.eWithM(0.)
#met_sum_et is increased if energy scale increased
tauDeltaMinus=vtau.pt()
met_sumet+= (systTauEnergyScale-1)/systTauEnergyScale *tauDeltaMinus
# scale jet energy scale, arbitrary but reasonable value
if systJetEnergyScale!=1. :
#data["PRI_jet_leading_pt"] *= systJetEnergyScale
data["PRI_jet_leading_pt"] = np.where(data["PRI_jet_num"] >0,
data["PRI_jet_leading_pt"]*systJetEnergyScale,
data["PRI_jet_leading_pt"])
#data["PRI_jet_subleading_pt"] *= systJetEnergyScale
data["PRI_jet_subleading_pt"] = np.where(data["PRI_jet_num"] >1,
data["PRI_jet_subleading_pt"]*systJetEnergyScale,
data["PRI_jet_subleading_pt"])
data["PRI_jet_all_pt"] *= systJetEnergyScale
jet_all_pt= data["PRI_jet_all_pt"]
#met_sum_et is increased if energy scale increased
met_sumet+= (systJetEnergyScale-1)/systJetEnergyScale *jet_all_pt
# first jet if it exists
vj1 = V4()
vj1.setPtEtaPhiM(data["PRI_jet_leading_pt"].where( data["PRI_jet_num"] > 0, other=0 ),
data["PRI_jet_leading_eta"].where( data["PRI_jet_num"] > 0, other=0 ),
data["PRI_jet_leading_phi"].where( data["PRI_jet_num"] > 0, other=0 ),
0.) # zero mass
# fix MET according to leading jet pt change
vj1DeltaMinus = vj1.copy()
vj1DeltaMinus.scaleFixedM( (1.-systJetEnergyScale)/systJetEnergyScale )
vmet += vj1DeltaMinus
vmet.pz = 0.
vmet.e = vmet.eWithM(0.)
# second jet if it exists
vj2=V4()
vj2.setPtEtaPhiM(data["PRI_jet_subleading_pt"].where( data["PRI_jet_num"] > 1, other=0 ),
data["PRI_jet_subleading_eta"].where( data["PRI_jet_num"] > 1, other=0 ),
data["PRI_jet_subleading_phi"].where( data["PRI_jet_num"] > 1, other=0 ),
0.) # zero mass
        # fix MET according to subleading jet pt change
vj2DeltaMinus = vj2.copy()
vj2DeltaMinus.scaleFixedM( (1.-systJetEnergyScale)/systJetEnergyScale )
vmet += vj2DeltaMinus
vmet.pz = 0.
vmet.e = vmet.eWithM(0.)
#note that in principle we should also fix MET for the third jet or more but we do not have enough information
if softMET>0:
# add soft met term
# Compute the missing v4 vector
random_state = np.random.RandomState(seed=seed)
SIZE = data.shape[0]
v4_soft_term = V4()
v4_soft_term.px = random_state.normal(0, softMET, size=SIZE)
v4_soft_term.py = random_state.normal(0, softMET, size=SIZE)
v4_soft_term.pz = np.zeros(SIZE)
v4_soft_term.e = v4_soft_term.eWithM(0.)
# fix MET according to soft term
vmet = vmet + v4_soft_term
data["PRI_met"] = vmet.pt()
data["PRI_met_phi"] = vmet.phi()
data["PRI_met_sumet"] = met_sumet
# Fix precision to 3 decimals
DECIMALS = 3
data["PRI_had_pt"] = data["PRI_had_pt"].round(decimals=DECIMALS)
data["PRI_had_eta"] = data["PRI_had_eta"].round(decimals=DECIMALS)
data["PRI_had_phi"] = data["PRI_had_phi"].round(decimals=DECIMALS)
data["PRI_lep_pt"] = data["PRI_lep_pt"].round(decimals=DECIMALS)
data["PRI_lep_eta"] = data["PRI_lep_eta"].round(decimals=DECIMALS)
data["PRI_lep_phi"] = data["PRI_lep_phi"].round(decimals=DECIMALS)
data["PRI_met"] = data["PRI_met"].round(decimals=DECIMALS)
data["PRI_met_phi"] = data["PRI_met_phi"].round(decimals=DECIMALS)
data["PRI_met_sumet"] = data["PRI_met_sumet"].round(decimals=DECIMALS)
data["PRI_jet_leading_pt"] = data["PRI_jet_leading_pt"].round(decimals=DECIMALS)
data["PRI_jet_leading_eta"] = data["PRI_jet_leading_eta"].round(decimals=DECIMALS)
data["PRI_jet_leading_phi"] = data["PRI_jet_leading_phi"].round(decimals=DECIMALS)
data["PRI_jet_subleading_pt"] = data["PRI_jet_subleading_pt"].round(decimals=DECIMALS)
data["PRI_jet_subleading_eta"] = data["PRI_jet_subleading_eta"].round(decimals=DECIMALS)
data["PRI_jet_subleading_phi"] = data["PRI_jet_subleading_phi"].round(decimals=DECIMALS)
data["PRI_jet_all_pt"] = data["PRI_jet_all_pt"].round(decimals=DECIMALS)
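# Hedged sketch (not part of the original script) of the workflow recommended in
# the module docstring for tau-energy-scale studies: build the reference with
# tes=1., skew a copy with tes!=1., cut PRI_had_pt > 22 GeV *after* the
# manipulation, and avoid DER_mass_MMC. Helper name and threshold are illustrative.
def _tes_study_example(data, tes=1.03, pt_cut=22.):
    reference = data.copy()
    mom4_manipulate(reference, 1., 1., 0.)   # reference: tes=1, jes=1, no soft MET
    skewed = data.copy()
    mom4_manipulate(skewed, tes, 1., 0.)     # tau energy scale shifted only
    reference = reference[reference["PRI_had_pt"] > pt_cut]
    skewed = skewed[skewed["PRI_had_pt"] > pt_cut]
    to_drop = [c for c in ("DER_mass_MMC",) if c in data.columns]
    return reference.drop(columns=to_drop), skewed.drop(columns=to_drop)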
# ==================================================================================
# MAIN : here is defined the behaviour of this module as a main script
# ==================================================================================
import argparse
def parse_args():
"""
ArgumentParser.
Return
------
args: the parsed arguments.
"""
# First create a parser with a short description of the program.
# The parser will automatically handle the usual stuff like the --help messages.
parser = argparse.ArgumentParser(
description="Higgs manipulation script. Can be used to produce new dataset with skewed features.")
# ====================================================================================
# Real things here
# ====================================================================================
    parser.add_argument("--quiet", "-q", help="Reduce verbosity (print less information).", action="store_true", dest='quiet')
    parser.add_argument("--debug", "-d", help="Debug mode : keep only the first 50 rows of the output.", default=False, action="store_true", dest='debug')
parser.add_argument("--csv", help="Option flag to prevent compression into gzip file.",
action="store_true", dest='csv')
parser.add_argument('--BKGnorm', help='All Background Weight norm scale factor. Reasonable value [0.9, 1.1]. Default 1.', default=1.,type=float, dest='bkg_scale')
parser.add_argument('--Wnorm', help='W Background Weight norm scale factor. Reasonable value [0.2, 5.] Default 1.',default=1., type=float, dest='w_scale')
parser.add_argument('--tes', help='Tau energy scale factor. Reasonable value [0.9, 1.1]. Default 1.', default=1., type=float, dest='tes')
parser.add_argument('--jes', help='Jet energy scale factor. Reasonable value [0.9, 1.1]. Default 1.', default=1.,type=float, dest='jes')
parser.add_argument('--softMET', help='soft MET term. Reasonable value [0.,10.]. Default 0.', default=0., type=float, dest='softMET')
parser.add_argument('--seed', help='random seed for soft MET term. Default 31415926 ', default=31415926, type=int, dest='seed')
parser.add_argument('-i', help='the name of the input file', default="atlas-higgs-challenge-2014-v2-s0_e500000.csv.gz", dest="in_file")
parser.add_argument('-o', default="higgs_output.csv", help='the name of the output file', dest="out_file")
# Now do your job and parse my command line !
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
quiet = args.quiet # quiet flag
csv = args.csv # csv flag
w_scale = args.w_scale # W bkg Weight scaling factor
    bkg_scale = args.bkg_scale # All bkg Weight scaling factor
    tes = args.tes # Tau energy scale factor
    jes = args.jes # Jet energy scale factor
softMET = args.softMET # soft Missing Energy Term
seed = args.seed # random seed for soft met term
in_file = args.in_file # input file
out_file = args.out_file # output file
debug = args.debug
columns = [ "EventId",
"PRI_had_pt",
"PRI_had_eta",
"PRI_had_phi",
"PRI_lep_pt",
"PRI_lep_eta",
"PRI_lep_phi",
"PRI_met",
"PRI_met_phi",
"PRI_met_sumet",
"PRI_jet_num",
"PRI_jet_leading_pt",
"PRI_jet_leading_eta",
"PRI_jet_leading_phi",
"PRI_jet_subleading_pt",
"PRI_jet_subleading_eta",
"PRI_jet_subleading_phi",
"PRI_jet_all_pt",
"Weight",
"Label",
"detailLabel",
]
if not quiet:
print("Loading the dataset")
data = load_higgs(in_file)
if w_scale is not None:
if not quiet:
print("W bkg weight rescaling :", w_scale)
w_bkg_weight_norm(data, w_scale)
if bkg_scale is not None:
if not quiet:
print("All bkg weight rescaling :", bkg_scale)
all_bkg_weight_norm(data, bkg_scale)
if tes is not None or jes is not None or softMET is not None:
if not quiet:
print("Tau energy rescaling :", tes)
print("Jet energy rescaling :", jes)
print("Soft MET addition :", softMET)
        mom4_manipulate(data, tes, jes, softMET, seed)
compression = None if csv else "gzip"
_, ext = os.path.splitext(out_file)
if ext != ".csv":
out_file += ".csv"
if not csv and ext != ".gz":
out_file += ".gz"
if not quiet:
print("Writing results to :", out_file)
if debug:
        print("debug mode : only keeping the first 50 rows")
        data = data.head(50)
data.to_csv(out_file, index=False, compression=compression, columns=columns)
print("Done.")