# backprop.py
import matplotlib
import matplotlib.pyplot as plt
import mnist
import numpy as np


# Gradient descent optimization
# The learning rate is specified by eta
class GDOptimizer(object):

    def __init__(self, eta):
        self.eta = eta

    def initialize(self, layers):
        pass

    # This function performs one gradient descent step
    # layers is a list of dense layers in the network
    # g is a list of gradients of the cost with respect to each layer's pre-activation z
    # a is a list of inputs to each layer (the outputs of the previous layer)
    def update(self, layers, g, a):
        m = a[0].shape[1]
        for layer, curGrad, curA in zip(layers, g, a):
            # TODO: PART E ###############################################################
            # PART E #####################################################################
            layer.W -= self.eta * np.dot(curGrad, curA.T) / m
            layer.b -= self.eta * np.mean(curGrad, axis=1).reshape((-1, 1))
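
# Minimal sketch of the update rule above (commented out, shapes are illustrative
# assumptions): for a layer with W of shape (numNodes, fanIn), curGrad has shape
# (numNodes, m) and curA has shape (fanIn, m) for a batch of m samples, so
# np.dot(curGrad, curA.T) / m averages the per-sample outer products and matches W.
#
#   W, b = np.zeros((2, 3)), np.zeros((2, 1))
#   curGrad = np.random.randn(2, 5)   # dCost/dz for a batch of 5 samples
#   curA = np.random.randn(3, 5)      # inputs to the layer, one column per sample
#   W -= 0.1 * np.dot(curGrad, curA.T) / 5
#   b -= 0.1 * np.mean(curGrad, axis=1).reshape((-1, 1))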


# Cost function used to compute prediction errors
class QuadraticCost(object):

    # Compute the squared error between the prediction yp and the observation y
    # This method should compute the cost per element such that the output is the
    # same shape as y and yp
    @staticmethod
    def fx(y, yp):
        # TODO: PART B ###################################################################
        # PART B #########################################################################
        return np.square(np.subtract(y, yp)) / 2.0

    # Derivative of the cost function with respect to yp
    @staticmethod
    def dx(y, yp):
        # TODO: PART B ###################################################################
        # PART B #########################################################################
        return np.subtract(yp, y)
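
# Quick elementwise check of the quadratic cost (commented out; the toy values are
# made up for illustration only):
#
#   y  = np.array([[1.0, -1.0]])
#   yp = np.array([[0.5, -2.0]])
#   QuadraticCost.fx(y, yp)   # -> [[0.125, 0.5]]   i.e. 0.5 * (y - yp)**2 per element
#   QuadraticCost.dx(y, yp)   # -> [[-0.5, -1.0]]   i.e. yp - y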


# Sigmoid function fully implemented as an example
class SigmoidActivation(object):

    @staticmethod
    def fx(z):
        # PART C Example #################################################################
        # PART C #########################################################################
        return 1 / (1 + np.exp(-z))

    @staticmethod
    def dx(z):
        # PART C Example #################################################################
        # PART C #########################################################################
        # sigma'(z) = sigma(z) * (1 - sigma(z)); note the explicit class reference,
        # since a bare fx(z) is not defined at module scope
        s = SigmoidActivation.fx(z)
        return s * (1 - s)


# Hyperbolic tangent function
class TanhActivation(object):

    # Compute tanh for each element in the input z
    @staticmethod
    def fx(z):
        # TODO: PART C ###################################################################
        # PART C #########################################################################
        return np.tanh(z)

    # Compute the derivative of the tanh function with respect to z
    @staticmethod
    def dx(z):
        # TODO: PART C ###################################################################
        # PART C #########################################################################
        return 1 - np.square(np.tanh(z))


# Rectified linear unit
class ReLUActivation(object):

    @staticmethod
    def fx(z):
        # TODO: PART C ###################################################################
        # PART C #########################################################################
        return z * (z > 0)

    @staticmethod
    def dx(z):
        # TODO: PART C ###################################################################
        # PART C #########################################################################
        # Subgradient of ReLU: 1 for z > 0, 0 otherwise (cast from bool to float)
        return (z > 0).astype(float)


# Linear activation
class LinearActivation(object):

    @staticmethod
    def fx(z):
        # TODO: PART C ###################################################################
        # PART C #########################################################################
        return z

    @staticmethod
    def dx(z):
        # TODO: PART C ###################################################################
        # PART C #########################################################################
        return np.ones(z.shape)
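
# Commented sanity check of the activation derivatives via central finite differences
# (step size and tolerance are arbitrary illustrative choices):
#
#   z = np.linspace(-2, 2, 7)
#   h = 1e-5
#   for act in (SigmoidActivation, TanhActivation, LinearActivation):
#       numeric = (act.fx(z + h) - act.fx(z - h)) / (2 * h)
#       assert np.allclose(numeric, act.dx(z), atol=1e-6)
#   # ReLUActivation.dx is a subgradient, so this check only holds away from z == 0.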


# This class represents a single hidden or output layer in the neural network
class DenseLayer(object):

    # numNodes: number of hidden units in the layer
    # activation: the activation function to use in this layer
    def __init__(self, numNodes, activation):
        self.numNodes = numNodes
        self.activation = activation

    def getNumNodes(self):
        return self.numNodes

    # Initialize the weight matrix and bias of this layer given fanIn, the size of the
    # previous layer (i.e. the number of columns of W)
    def initialize(self, fanIn, scale=1.0):
        s = scale * np.sqrt(6.0 / (self.numNodes + fanIn))
        self.W = np.random.normal(0, s, (self.numNodes, fanIn))
        self.b = np.random.uniform(-1, 1, (self.numNodes, 1))

    # Apply the activation function of the layer to the input z
    def a(self, z):
        return self.activation.fx(z)

    # Compute the linear part of the layer
    # The input a is a k x n matrix where k is the dimension of the previous layer
    # (or of the network input) and n is the number of samples, one sample per column
    def z(self, a):
        return self.W.dot(a) + self.b

    # Compute the derivative of the layer's activation function with respect to z,
    # where z is the output of the function above.
    # This derivative does not include the derivative of the matrix multiplication
    # in the layer; that part is computed in the Model class below.
    def dx(self, z):
        return self.activation.dx(z)

    # Update the weights of the layer by subtracting dW from the weights
    def updateWeights(self, dW):
        self.W = self.W - dW

    # Update the bias of the layer by subtracting db from the bias
    def updateBias(self, db):
        self.b = self.b - db
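
# Shape conventions, as a commented sketch (the sizes below are arbitrary): a
# DenseLayer with numNodes=4 and fanIn=3 holds W of shape (4, 3) and b of shape (4, 1);
# z() takes the previous layer's activations as a (fanIn x m) matrix, one column per
# sample, and returns a (numNodes x m) matrix.
#
#   layer = DenseLayer(4, TanhActivation())
#   layer.initialize(fanIn=3)
#   aPrev = np.random.randn(3, 10)   # 10 samples
#   zOut = layer.z(aPrev)            # shape (4, 10)
#   aOut = layer.a(zOut)             # shape (4, 10)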


# This class handles stacking layers together to form the completed neural network
class Model(object):

    # inputSize: the dimension of the inputs that go into the network
    def __init__(self, inputSize):
        self.layers = []
        self.inputSize = inputSize

    # Add a layer to the end of the network
    def addLayer(self, layer):
        self.layers.append(layer)

    # Get the output size of the layer at the given index
    def getLayerSize(self, index):
        if index >= len(self.layers):
            return self.layers[-1].getNumNodes()
        elif index < 0:
            return self.inputSize
        else:
            return self.layers[index].getNumNodes()

    # Initialize the weights of all of the layers in the network and set the cost
    # function to use for optimization
    def initialize(self, cost, initializeLayers=True):
        self.cost = cost
        if initializeLayers:
            for i in range(len(self.layers)):
                self.layers[i].initialize(self.getLayerSize(i - 1))

    # Compute the output of the network given some input x
    # The matrix x has shape n x k where n is the number of samples and
    # k is the input dimension
    # This function returns
    #   yp - the output of the network
    #   a  - a list of inputs for each layer of the network, where a[i] is the
    #        input to layer i (note this does not include the network output!)
    #   z  - a list of values for each layer after evaluating layer.z(a) but
    #        before applying the layer's nonlinear activation
    def evaluate(self, x):
        curA = x.T
        a = [curA]
        z = []
        for layer in self.layers:
            z.append(layer.z(curA))
            curA = layer.a(z[-1])
            a.append(curA)
        yp = a.pop()
        return yp, a, z

    # Compute the output of the network given some input a
    # The matrix a has shape n x k where n is the number of samples and
    # k is the input dimension
    def predict(self, a):
        a, _, _ = self.evaluate(a)
        return a.T

    # Computes the gradients at each layer. y is the true labels, yp is the
    # predicted labels, and z is a list of the intermediate values in each
    # layer. Returns the gradients and the forward pass outputs (per layer).
    #
    # In particular, we return a list of dMSE/dz_i. The reasoning behind this is that
    # in the update function for the optimizer, we do not give it the z values
    # we compute from evaluating the network.
    def compute_grad(self, x, y):
        # Feed forward, computing the outputs of each layer and the
        # intermediate values before the nonlinearities
        yp, a, z = self.evaluate(x)
        # d is initialized here to be (dMSE / dyp)
        d = self.cost.dx(y.T, yp)
        grad = []
        # Backpropagate the error
        for layer, curZ in zip(reversed(self.layers), reversed(z)):
            # TODO: PART D ###################################################################
            # grad[i] should correspond with the gradient of the output of layer i before the
            # activation is applied (dMSE / dz_i); be sure values are stored in the correct
            # ordering!
            # PART D #########################################################################
            grad.append(d * layer.dx(curZ))
            d = layer.W.T @ grad[-1]
        grad = list(reversed(grad))
        return grad, a
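
    # The recursion implemented above, written out: with delta_i = dMSE/dz_i, the last
    # layer has delta_L = cost.dx(y, yp) * activation.dx(z_L), and each earlier layer
    # has delta_i = (W_{i+1}^T delta_{i+1}) * activation.dx(z_i). A commented,
    # illustrative finite-difference check (shapes and step size are assumptions):
    #
    #   net = Model(inputSize=3)
    #   net.addLayer(DenseLayer(4, TanhActivation()))
    #   net.addLayer(DenseLayer(1, LinearActivation()))
    #   net.initialize(QuadraticCost())
    #   x0, y0 = np.random.randn(5, 3), np.random.randn(5, 1)
    #   grad, a = net.compute_grad(x0, y0)
    #   dW0 = np.dot(grad[0], a[0].T) / 5   # analytic mean gradient for layer 0's W
    #   # Perturbing one entry of net.layers[0].W by +/- 1e-5 and re-evaluating
    #   # np.mean(QuadraticCost.fx(y0, net.predict(x0))) should reproduce dW0 there.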

    # Train the network given the inputs x and the corresponding observations y
    # The network should be trained for numEpochs iterations using the supplied
    # optimizer
    def train(self, x, y, numEpochs, optimizer):
        # Initialize bookkeeping
        n = x.shape[0]
        x = x.copy()
        y = y.copy()
        hist = []
        optimizer.initialize(self.layers)
        # Run for the specified number of epochs
        for epoch in range(numEpochs):
            # Compute the gradients
            grad, a = self.compute_grad(x, y)
            # Update the network weights
            optimizer.update(self.layers, grad, a)
            # Compute the error at the end of the epoch
            yh = self.predict(x)
            C = self.cost.fx(y, yh)
            C = np.mean(C)
            hist.append(C)
        return hist

    # Train the network with mini-batch stochastic gradient descent, shuffling the
    # data each epoch and splitting it into batches of size batchSize
    def trainBatch(self, x, y, batchSize, numEpochs, optimizer):
        # Copy the data so that shuffling does not affect the caller's arrays
        x = x.copy()
        y = y.copy()
        hist = []
        n = x.shape[0]
        for epoch in np.arange(numEpochs):
            # Shuffle the data
            r = np.random.permutation(n)
            x = x[r, :]
            y = y[r, :]
            e = []
            # Split the data into chunks and run SGD on each mini-batch
            for i in range(0, n, batchSize):
                end = min(i + batchSize, n)
                batchX = x[i:end, :]
                batchY = y[i:end, :]
                e += self.train(batchX, batchY, 1, optimizer)
            hist.append(np.mean(e))
        return hist
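
# A minimal end-to-end sketch (commented out; the toy data and hyperparameters are
# made up for illustration): build a one-hidden-layer network, train it with full-batch
# gradient descent on a small regression problem, and confirm the cost history decreases.
#
#   rng = np.random.RandomState(0)
#   xToy = rng.randn(200, 2)
#   yToy = (xToy[:, :1] - xToy[:, 1:]) ** 2       # targets, shape (200, 1)
#   toy = Model(inputSize=2)
#   toy.addLayer(DenseLayer(8, TanhActivation()))
#   toy.addLayer(DenseLayer(1, LinearActivation()))
#   toy.initialize(QuadraticCost())
#   hist = toy.train(xToy, yToy, numEpochs=200, optimizer=GDOptimizer(eta=0.05))
#   print(hist[0], hist[-1])                      # the cost should go down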


if __name__ == '__main__':
    N0 = 3
    N1 = 9

    # Switch these statements to True to run the code for the corresponding parts
    # Part E
    SGD = True
    # Part F
    DIFF_SIZES = True

    # Generate the training set
    np.random.seed(9001)
    y_train = mnist.train_labels()
    y_test = mnist.test_labels()
    X_train = mnist.train_images() / 255.0
    X_test = mnist.test_images() / 255.0

    # Keep only the two digit classes N0 and N1 and relabel them as +1 / -1
    train_idxs = np.logical_or(y_train == N0, y_train == N1)
    test_idxs = np.logical_or(y_test == N0, y_test == N1)
    y_train = y_train[train_idxs].astype('int')
    y_test = y_test[test_idxs].astype('int')
    X_train = X_train[train_idxs]
    X_test = X_test[test_idxs]
    y_train = (y_train == N0).astype('int')
    y_test = (y_test == N0).astype('int')
    y_train = 2 * y_train - 1
    y_test = 2 * y_test - 1

    # Flatten the images into vectors and arrange the labels as column vectors
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_test = X_test.reshape(X_test.shape[0], -1)
    y_test = y_test[:, np.newaxis]
    y = y_train[:, np.newaxis]
    x = X_train

    xLin = np.linspace(-np.pi, np.pi, 250).reshape((-1, 1))
    yHats = {}
    activations = dict(ReLU=ReLUActivation,
                       tanh=TanhActivation,
                       linear=LinearActivation)
    lr = dict(ReLU=0.02, tanh=0.02, linear=0.005)
    names = ['ReLU', 'linear', 'tanh']

    #### PART E ####
    if SGD:
        print('\n----------------------------------------\n')
        print('Using SGD')
        for key in names[1:2]:
            for batchSize in [10, 50, 100, 200]:
                for epoch in [10, 20, 40]:
                    activation = activations[key]
                    model = Model(x.shape[1])
                    model.addLayer(DenseLayer(4, activation()))
                    model.addLayer(DenseLayer(1, LinearActivation()))
                    model.initialize(QuadraticCost())
                    hist = model.trainBatch(x, y, batchSize, epoch, GDOptimizer(eta=lr[key]))
                    y_hat_train = model.predict(x)
                    y_pred_train = np.sign(y_hat_train)
                    y_hat_test = model.predict(X_test)
                    y_pred_test = np.sign(y_hat_test)
                    error_train = np.mean(np.square(y_hat_train - y)) / 2
                    error_test = np.mean(np.square(y_hat_test - y_test)) / 2
                    print('Batch size:', batchSize, 'Epoch:', epoch,
                          'Train Error:', error_train, 'Test Error:', error_test)

    #### PART F ####
    # Train with different sized networks
    if DIFF_SIZES:
        print('\n----------------------------------------\n')
        print('Training with various sized networks')
        names = ['ReLU', 'tanh']
        widths = [2, 4, 8, 16, 32]
        errors = {}
        for key in names[1:]:
            error = []
            for width in widths:
                activation = activations[key]
                model = Model(x.shape[1])
                model.addLayer(DenseLayer(width, activation()))
                model.addLayer(DenseLayer(1, LinearActivation()))
                model.initialize(QuadraticCost())
                epochs = 256
                hist = model.trainBatch(x, y, x.shape[0], epochs, GDOptimizer(eta=lr[key]))
                y_hat_train = model.predict(x)
                y_hat_test = model.predict(X_test)
                error_train = np.mean(np.square(y_hat_train - y)) / 2
                error_test = np.mean(np.square(y_hat_test - y_test)) / 2
                print(key + ' neural network width(' + str(width) + ') train MSE: ' + str(error_train))
                print(key + ' neural network width(' + str(width) + ') test MSE: ' + str(error_test))