forked from vitoriapacela/RegressionLCD
-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocessing.py
187 lines (154 loc) · 6.09 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
'''
preprocessing.py
Contains custom utilities for preprocessing raw data.
Author: Vitoria Barin Pacela
e-mail: [email protected]
'''
import os, sys, glob, h5py
import numpy as np
if __package__ is None:
sys.path.append(os.path.realpath("/data/shared/Software/CMS_Deep_Learning"))
from CMS_Deep_Learning.io import simple_grab, nb_samples_from_h5
def reshapeData(inp):
    '''
    Split one batch into the calorimeter inputs and the energy target.

    Used as the ``prep_func`` argument of the data generator (see the
    ``gen_from_data`` call quoted in ``genHsum``).

    :param inp: nested pair ``((ECAL, HCAL), target)``. ``target`` must be
        indexable as a 2-D array. The original notes quote shapes
        (10000, 25, 25, 25), (10000, 5, 5, 60) and (10000, 2) -- TODO confirm.
    :type inp: tuple
    :return: ``((ECAL, HCAL), [energy])`` where ``energy`` is
        ``target[:, 1:]``, i.e. every target column except the first.
    :rtype: tuple
    '''
    calorimeters, target = inp
    ecal, hcal = calorimeters
    # Drop column 0 of the target and wrap what is left in a one-element list.
    return (ecal, hcal), [target[:, 1:]]
def nSum(directory):
    '''
    Naive sum of the shower deposits in the ECAL and HCAL.

    Every ``*.h5`` file directly inside ``directory`` is opened read-only,
    its ``ECAL`` and ``HCAL`` datasets are loaded as float32, and each one is
    summed over its last three axes. The per-file results are then added
    together element-wise.

    NOTE(review): the original author flagged this function as not working
    and pointed to a newer one (presumably ``inpSum``). Because per-file sums
    are added element-wise, files holding different numbers of events will
    not combine into meaningful totals and may fail to broadcast.

    :param directory: path to the directory with HDF5 files.
    :type directory: str
    :return: accumulated sums for the ECAL and HCAL, respectively;
        ``(0, 0)`` when the directory contains no ``.h5`` file.
    :raises OSError: if ``directory`` is not an existing directory.
    '''
    root = os.path.abspath(directory)
    if not os.path.isdir(root):
        # Previously the loop below would iterate over the characters of the
        # path string and try to open each one as an HDF5 file.
        raise OSError("nSum: not a directory: {!r}".format(directory))

    s_ecal = 0
    s_hcal = 0
    for fileName in glob.glob(os.path.join(root, "*.h5")):
        # The context manager closes the file even when a dataset is missing.
        with h5py.File(fileName, "r") as inp:
            ecal = np.array(inp["ECAL"], dtype="float32")
            hcal = np.array(inp["HCAL"], dtype="float32")
        s_ecal += np.sum(np.sum(np.sum(ecal, axis=-1), axis=-1), axis=-1, keepdims=True)
        s_hcal += np.sum(np.sum(np.sum(hcal, axis=-1), axis=-1), axis=-1, keepdims=True)
    return s_ecal, s_hcal
def nSamples(directory):
    '''
    Return number of samples in the directory.

    Every entry returned by ``os.listdir(directory)`` is handed to
    ``nb_samples_from_h5`` and the individual counts are added up. Entries
    are not filtered by extension.

    Relies on the module-level imports of ``os`` and ``nb_samples_from_h5``
    instead of re-importing them (and re-appending to ``sys.path``) on every
    call.

    :param directory: path to directory that contains the HDF5 files; a
        trailing path separator is optional.
    :type directory: str
    :return: number of samples (``0`` for an empty directory)
    :rtype: int
    '''
    # os.path.join keeps the path valid whether or not ``directory`` ends with
    # a separator; plain string concatenation produced "dirfile.h5" otherwise.
    return sum(
        nb_samples_from_h5(os.path.join(directory, f))
        for f in os.listdir(directory)
    )
def genHsum(generator):
    '''
    Wrap a batch generator and add the per-event HCAL sum to its inputs.

    Each item pulled from ``generator`` must unpack as
    ``((ecal, hcal), true)``. For every such item this yields
    ``([ecal, hcal, s_hcal], true)``, where ``s_hcal`` is ``hcal`` summed
    over its last three axes (the last reduced axis is kept with length 1).

    The wrapper stops cleanly when ``generator`` is exhausted. Calling
    ``next()`` inside ``while True`` would turn that into a ``RuntimeError``
    on Python 3.7+ (PEP 479).

    :param generator: gen_from_data(train_dir, batch_size=500, data_keys=[["ECAL", "HCAL"], "target"], prep_func=reshapeData)
    :type generator: generator
    :return: yields ``[ECAL, HCAL, HCALsum], target``
    :rtype: per the original notes, arrays with shape (n, 25, 25, 25),
        (n, 5, 5, 60) and (n, 1), n being the batch size -- TODO confirm
        against the data.
    '''
    for (ecal, hcal), true in generator:
        s_hcal = np.sum(np.sum(np.sum(hcal, axis=-1), axis=-1), axis=-1, keepdims=True)
        yield [ecal, hcal, s_hcal], true
def _genSum(generator):
    '''
    Wrap a batch generator and add the per-event ECAL and HCAL sums.

    Each item pulled from ``generator`` must unpack as
    ``((ecal, hcal), true)``. For every such item this yields
    ``([ecal, hcal, sums], true)``, where column 0 of ``sums`` is ``ecal``
    summed over its last three axes and column 1 is the same for ``hcal``.

    Changes from the earlier, untested version:

    * ``sums`` now has the documented shape (n, 2). Before, the two sums were
      stacked into shape (2, n, 1) and that array was yielded as-is.
    * The hard-coded ``reshape(500, 2)`` is gone, so any batch size works.
    * Iteration ends cleanly when ``generator`` is exhausted instead of
      raising ``RuntimeError`` (PEP 479).

    :param generator: gen_from_data(train_dir, batch_size=500, data_keys=[["ECAL", "HCAL"], "target"], prep_func=reshapeData)
    :type generator: generator
    :return: yields ``[ECAL, HCAL, sum[ECAL, HCAL]], target``
    :rtype: per the original notes, arrays with shape (n, 25, 25, 25),
        (n, 5, 5, 60) and (n, 2), n being the batch size -- TODO confirm
        against the data.
    '''
    def total(cal):
        # Collapse the last three axes, keeping a trailing axis of length 1.
        return np.sum(np.sum(np.sum(cal, axis=-1), axis=-1), axis=-1, keepdims=True)

    for (ecal, hcal), true in generator:
        # Joining the two (n, 1) columns side by side gives one row per event.
        sums = np.concatenate([total(ecal), total(hcal)], axis=-1)
        yield [ecal, hcal, sums], true
def sumCal(cal):
    '''
    Sum the energy deposits over the last three axes of a calorimeter array.

    :type cal: numpy.ndarray, 4D.
    :param cal: ECAL or HCAL input.
    :return: sum of the energy values; the last reduced axis is kept with
        length 1, so a 4D input gives a 2D result.
    :rtype: numpy.ndarray, 2D.
    '''
    collapsed = cal
    # Two plain reductions over the trailing axis ...
    for _ in range(2):
        collapsed = np.sum(collapsed, axis=-1)
    # ... and a final one that keeps the reduced axis.
    return np.sum(collapsed, axis=-1, keepdims=True)
def inpSum(dir):
    '''
    Naive sum of the shower deposits in the ECAL and HCAL.

    Grabs the ``ECAL`` and ``HCAL`` inputs through ``simple_grab``, reduces
    each with ``sumCal``, flattens both results and adds them element-wise.

    :type dir: str.
    :param dir: path to the directory with HDF5 files.
    :return: sum of the ECAL and HCAL sums.
    :rtype: numpy.ndarray, 1D (shape (n,) for 4D calorimeter inputs).
    '''
    calorimeters = simple_grab('X', data=dir, label_keys=['ECAL', 'HCAL'],
                               input_keys=['ECAL', 'HCAL'])
    ecal, hcal = calorimeters
    # Flatten each sumCal result to 1-D before adding the two together.
    flat_ecal, flat_hcal = [sumCal(cal).ravel() for cal in (ecal, hcal)]
    return flat_ecal + flat_hcal
def preSum(train_dir, particle=""):
    '''
    Collect energy targets and naive ECAL+HCAL sums, and save both to HDF5.

    To be used before training for data visualization. The targets are
    grabbed with ``simple_grab``, their first column is dropped and the rest
    is flattened. The sums come from ``inpSum``. Both arrays are always
    written to disk through ``saveSum_toHDF5`` before being returned.

    :type train_dir: str.
    :param train_dir: path to the training directory with HDF5 files.
    :type particle: str.
    :param particle: name of the particle; passed to ``saveSum_toHDF5`` as
        the output file name prefix.
    :return: energy targets and energy sum arrays.
    :rtype: numpy.ndarray, numpy.ndarray; both 1D.
    '''
    targets = simple_grab('Y', data=train_dir, label_keys='target',
                          input_keys=['ECAL', 'HCAL'])
    # Keep every target column except the first, flattened to 1-D.
    energies = targets[:, 1:].ravel()

    deposits = inpSum(train_dir)

    saveSum_toHDF5(particle, energies, deposits)
    return energies, deposits
def saveSum_toHDF5(name, true, inSum):
    '''
    Save the true energies and the energy sums into an HDF5 file.

    The two arrays are stacked into one array (row 0 is ``true``, row 1 is
    ``inSum``) and stored as the dataset ``true_sum`` in
    ``<name>TruePred.h5``. The file is opened in append mode, so it is
    created when missing. A ``true_sum`` dataset already in the file is
    replaced; before, a second call with the same ``name`` failed because
    the dataset already existed.

    :param name: prefix of the file to be saved.
    :type name: str.
    :param true: array of energy targets (true value label).
    :type true: numpy.ndarray
    :param inSum: array of energy sums, stored next to the targets.
    :type inSum: numpy.ndarray
    '''
    true_sum = np.array([true, inSum])
    # The context manager closes the file even if writing the dataset fails.
    with h5py.File(name + "TruePred.h5", 'a') as change_file:
        if "true_sum" in change_file:
            # create_dataset refuses to overwrite, so drop the old one first.
            del change_file["true_sum"]
        change_file.create_dataset("true_sum", data=true_sum)