forked from mbandrews/MLAnalyzer
-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocess_RH+DIGI_EBcrops.py
100 lines (82 loc) · 2.47 KB
/
preprocess_RH+DIGI_EBcrops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import numpy as np
import ROOT
from root_numpy import root2array, tree2array
#from root_pandas.readwrite import convert_to_dataframe
from dask.delayed import delayed
import dask.array as da
import dask.dataframe as df
import h5py
# EOS directory holding both the input crops and the processed output.
# (An earlier revision pointed at '/eos/cms/store/user/mandrews/ML/IMGs'.)
eosDir = '/eos/cms/store/user/mandrews/ML/IMGs_RAW'

# Sample names; each one is substituted into the input/output file names.
decays = ["SinglePhotonPt50", "SingleElectronPt50"]

# Crops are s x s squares stored flattened, i.e. s*s entries per channel.
s = 32
crop_size = s * s

# Number of events handled per dask chunk (9960 was used previously).
chunk_size = 3000

# Number of channels stored per event in the input dataset.
n_channels = 14

# Not referenced in the visible part of this script.
ver = 1

# Chunking applied to the input '/X' dataset:
# (events, channels, flattened crop).
chunk_shape = (chunk_size, n_channels, crop_size)
def _unit_normalize(values):
    """Return `values` scaled to unit L2 norm along the last axis.

    A new array is returned; `values` itself is not modified. Entries that
    come out as NaN (e.g. 0/0 when a whole slice is zero) are set to 0.
    """
    norm = np.expand_dims(np.linalg.norm(values, axis=-1), -1)
    # An all-zero slice has zero norm, so 0/0 is expected here: silence the
    # warning and replace the resulting NaNs just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        scaled = values / norm
    scaled[np.isnan(scaled)] = 0.
    return scaled


@delayed
def process_chunk(x):
    """Normalise one chunk of flattened crops into channel-last images.

    Args:
        x: array indexed as (event, channel, flattened crop pixel) with at
            least 7 channels. Channels 0-3 are handled as two energy/time
            pairs and channels 4 onward as digi samples (labels follow the
            original comments in this script -- TODO confirm against the
            producer of the '/X' dataset).

    Returns:
        A new array of shape (events, s, s, channels), where `s` is the
        module-level crop side length. Per channel:
          * 0 and 2 (energy): scaled to unit L2 norm per event;
          * 1 and 3 (time): divided by a fixed scale of 50;
          * 4 onward (digi): noise-subtracted, then scaled to unit L2 norm
            per (event, channel).

    The input `x` is left untouched: every step works on new arrays rather
    than on views of `x`, so the caller's chunk is never modified in place.
    """
    t_scale = 50.

    # Energy / time pairs.
    Efull = _unit_normalize(x[:, 0, :])
    tfull = x[:, 1, :] / t_scale
    E = _unit_normalize(x[:, 2, :])
    t = x[:, 3, :] / t_scale
    rh = np.stack([Efull, tfull, E, t], axis=1)

    # Noise: per-pixel mean of channels 4-6, with a length-1 channel axis so
    # that it broadcasts across all digi channels.
    noise = np.expand_dims(np.mean(x[:, 4:7, :], axis=1), axis=1)

    # Digi: subtract the noise estimate, then normalise each crop.
    digi = _unit_normalize(x[:, 4:, :] - noise)

    # (events, channels, s*s) -> (events, channels, s, s) -> channel-last.
    X = np.concatenate([rh, digi], axis=1)
    X = X.reshape((-1, X.shape[1], s, s))
    return np.transpose(X, [0, 2, 3, 1])
# Process every sample: read the raw crops, normalise them chunk by chunk
# with process_chunk, and write the result next to the input file.
for decay in decays:

    file_in_str = "%s/%s_IMGCROPS_n249k_pT.hdf5" % (eosDir, decay)
    file_out_str = "%s/%s_IMGCROPS_n249k_pT_RHv5+DIGIv6.hdf5" % (eosDir, decay)

    # Open the input read-only (older h5py versions default to read/write)
    # and close it deterministically. The dask arrays below are lazy views
    # of this file, so all computation, including the final write, has to
    # happen inside the `with` block.
    with h5py.File(file_in_str, 'r') as dset:

        X_in = da.from_array(dset['/X'], chunks=chunk_shape)
        y_in = da.from_array(dset['/y'], chunks=(chunk_size,))

        events = X_in.shape[0]
        #events = 10000

        # Explicit checks instead of `assert`, which is stripped under -O.
        if y_in.shape[0] != events:
            raise ValueError(
                "'/X' has %d events but '/y' has %d in %s"
                % (events, y_in.shape[0], file_in_str))
        # Every delayed chunk is declared with exactly chunk_size events, so
        # a ragged final chunk would be given the wrong shape.
        if events % chunk_size != 0:
            raise ValueError(
                "Number of events (%d) is not a multiple of chunk_size (%d)"
                % (events, chunk_size))

        print(" >> Doing decay: %s" % decay)
        print(" >> Input file: %s" % file_in_str)
        print(" >> Total events: %s" % events)
        print(" >> Processing...")

        # One delayed task per chunk of events; the shape and dtype declared
        # here are what dask assumes for each task's result.
        X = da.concatenate([
            da.from_delayed(
                process_chunk(X_in[i:i+chunk_size]),
                shape=(chunk_size, s, s, n_channels),
                dtype=np.float32)
            for i in range(0, events, chunk_size)])

        print(" >> Writing to: %s" % file_out_str)
        da.to_hdf5(file_out_str, {'/X': X, '/y': y_in}, compression='lzf')

    print(" >> Done.\n")