# physics.py -- Pylearn2 dataset for physics data.
# Forked from uci-igb/higgs-susy.
# Author: Peter Sadowski, May 2014.
from pylearn2.datasets import dense_design_matrix
from pylearn2.datasets import control
from pylearn2.utils import serial
import os
import numpy as np
import pickle as pkl
import csv
class PHYSICS(dense_design_matrix.DenseDesignMatrix):
    def __init__(self,
                 which_set,
                 benchmark,
                 derived_feat=True,
                 version='',   # Unused below.
                 seed=None,    # Randomize data order if seed is not None.
                 start=0,      # Unused below.
                 stop=np.inf):
        self.args = locals()
        path = os.environ['PYLEARN2_DATA_PATH']
        # derived_feat may arrive as a string; coerce it to a bool.
        if derived_feat == 'False':
            derived_feat = False
        elif derived_feat == 'True':
            derived_feat = True
        if benchmark == 1:
            inputfile = '%s/HIGGS.csv' % path
        elif benchmark == 2:
            inputfile = '%s/SUSY.csv' % path
        else:
            raise ValueError('Unknown benchmark: %s' % benchmark)
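        # HIGGS.csv / SUSY.csv are presumably the benchmark datasets of
        # Baldi, Sadowski & Whiteson (2014), as distributed through the
        # UCI Machine Learning Repository.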
        # We need two arrays, X (inputs) and y (targets). Their sizes are
        # known in advance, so they are allocated up front (below) rather
        # than grown row by row.
        # Define the sizes of the training, validation, and test sets.
        if benchmark == 1:
            # HIGGS
            ntrain = 10000000
            nvalid = 500000
            ntest = 500000
        elif benchmark == 2:
            # SUSY
            ntrain = 4000000
            nvalid = 500000
            ntest = 500000
        if which_set == 'train':
            offset = 0
            nrows = ntrain
        elif which_set == 'valid':
            offset = ntrain
            nrows = nvalid
        elif which_set == 'test':
            offset = ntrain + nvalid
            nrows = ntest
        else:
            raise ValueError('Unknown which_set: %s' % which_set)
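        # The three subsets are contiguous blocks of the CSV: train rows
        # [0, ntrain), validation rows [ntrain, ntrain + nvalid), and test
        # rows [ntrain + nvalid, ntrain + nvalid + ntest).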
        # Define the relevant columns for each feature set. derived_feat
        # may be True, False, or the string 'only'; the string case must
        # be tested first, because 'only' is truthy.
        if benchmark == 1:
            if derived_feat == 'only':
                xcolmin = 22
                xcolmax = 29
                ycolmin = 0
                ycolmax = 1
            elif not derived_feat:
                xcolmin = 1
                xcolmax = 22
                ycolmin = 0
                ycolmax = 1
            else:
                xcolmin = 1
                xcolmax = 29
                ycolmin = 0
                ycolmax = 1
        elif benchmark == 2:
            if derived_feat == 'only':
                xcolmin = 9
                xcolmax = 19
                ycolmin = 0
                ycolmax = 1
            elif not derived_feat:
                xcolmin = 1
                xcolmax = 9
                ycolmin = 0
                ycolmax = 1
            else:
                xcolmin = 1
                xcolmax = 19
                ycolmin = 0
                ycolmax = 1
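        # Recap of the column layout implied by the ranges above: column 0
        # holds the class label; HIGGS rows carry 21 low-level features
        # (columns 1-21) plus 7 derived features (columns 22-28), while
        # SUSY rows carry 8 low-level features (columns 1-8) plus 10
        # derived features (columns 9-18).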
        # Limit the number of samples.
        stop = min(stop, nrows)
        X = np.empty([stop, xcolmax - xcolmin], dtype='float32')
        y = np.empty([stop, ycolmax - ycolmin], dtype='float32')
        # Randomize the data order.
        indices = np.arange(X.shape[0])
        if seed is not None:
            rng = np.random.RandomState(seed)  # Reproducible for a fixed seed.
            rng.shuffle(indices)
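        # Note that the permutation is applied to the *write* positions:
        # row k of the stream is stored in slot indices[k], so the data
        # end up shuffled after a single sequential pass over the CSV.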
        # Finally, read in the data.
        nskipped = 0
        nread = 0
        with open(inputfile) as f:
            for row in csv.reader(f):
                # Possibly skip some events at the start.
                if nskipped < offset:
                    nskipped += 1
                    continue
                # Stop once we have read the number of rows we want.
                if nread >= stop:
                    break
                # Otherwise, store this event.
                X[indices[nread]] = row[xcolmin:xcolmax]
                y[indices[nread]] = row[ycolmin:ycolmax]
                nread += 1
        print('Data loaded: benchmark {} ({})'.format(benchmark, which_set))
        # Initialize the superclass, DenseDesignMatrix.
        super(PHYSICS, self).__init__(X=X, y=y)
    def standardize(self, X):
        """
        Standardize each feature (column) of X, in place:
        1) If the column contains negative values, assume it is roughly
           normally or uniformly distributed: center it and scale it to
           unit variance.
        2) Otherwise, if the column has values above 1, assume it is
           exponentially distributed: rescale it so its mean is 1.
        Columns that are non-negative with maximum <= 1 are left unchanged.
        """
        for j in range(X.shape[1]):
            vec = X[:, j]
            if np.min(vec) < 0:
                # Assume the data are Gaussian or uniform: center and standardize.
                vec = vec - np.mean(vec)
                vec = vec / np.std(vec)
            elif np.max(vec) > 1.0:
                # Assume the data are exponential: just set the mean to 1.
                vec = vec / np.mean(vec)
            X[:, j] = vec
        return X
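

# ----------------------------------------------------------------------
# Minimal usage sketch (illustrative; not part of the original module).
# It assumes PYLEARN2_DATA_PATH points at a directory containing
# HIGGS.csv, and loads a small, shuffled slice of the training set as a
# quick sanity check.
if __name__ == '__main__':
    train = PHYSICS(which_set='train', benchmark=1, seed=0, stop=1000)
    train.X = train.standardize(train.X)
    print('X shape: {}, y shape: {}'.format(train.X.shape, train.y.shape))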