backprop.m
function [dW,db,f] = backprop(W, b, transfer, input, target, loss, varargin)
% BACKPROP Compute the gradient of a feed-forward neural network.
%   [dW,db] = BACKPROP(W, b, transfer, input, target, loss) computes the
%   gradients of the given loss with respect to the weights (dW) and
%   biases (db) of a feed-forward network described by corresponding cell
%   arrays of weights (W), biases (b) and transfer functions (transfer),
%   evaluated on the given input and target.
%
%   [dW,db,f] = BACKPROP(...) also returns the loss value f.
%
%   Name/value pair options (default value):
%
%   'DropoutRate' (0): Set this to a number > 0 and < 1 to apply dropout
%       to the hidden layers; 0 means no dropout.
%
%   'Normalization' ('full'): Normalization factor. If 'full', the loss
%       and derivatives are divided by the total number of elements in
%       target/output (the MATLAB default). If 'batch', they are divided
%       by the number of columns (samples).
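%
%   Example (illustrative sketch; the transfer functions 'tansig' and
%   'softmax' are assumed to come from the Neural Network Toolbox, and
%   BACKPROP_LOSS must be on the path):
%
%       W = {randn(10,4); randn(3,10)};      % weights of a 4-10-3 network
%       b = {zeros(10,1); zeros(3,1)};       % biases
%       transfer = {'tansig'; 'softmax'};    % hidden and output transfers
%       X = randn(4,100);                    % 100 input samples (columns)
%       T = full(ind2vec(randi(3,1,100)));   % one-hot targets
%       [dW,db,f] = backprop(W, b, transfer, X, T, 'crossentropy');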
% Get opts
p = inputParser;
p.CaseSensitive = false;
p.addParameter('DropoutRate', 0, @isfloat)
p.addParameter('Normalization', 'full', @ischar)
p.parse(varargin{:});
dropout = p.Results.DropoutRate;
assert(dropout < 1, 'Dropout rate must be < 1!');
normalization = p.Results.Normalization;
numLayers = length(W);
% numWeightElements = sum(cellfun(@numel, W)) + sum(cellfun(@numel, b));
%% Forward pass
% Layer outputs and derivatives
o = cell(numLayers, 1);
do = cell(numLayers, 1);
% Forward propagate
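% Each layer computes a = W{i}*o{i-1} + b{i} (the first layer uses the raw
% input) and caches both the layer output o{i} = transfer{i}(a) and its
% derivative do{i} with respect to a, which the backward pass below reuses.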
for i = 1:numLayers
    if i == 1
        a = bsxfun(@plus, W{i} * input, b{i});
    else
        a = bsxfun(@plus, W{i} * o{i-1}, b{i});
    end
    o{i} = feval(transfer{i}, a);
    do{i} = feval(transfer{i}, 'dn', a);
    % % TODO: This is the derivative in the softmax case - and it's slow as
    % % hell on GPU
    % if strcmp(transfer{i}, 'softmax')
    %     [dim,N] = size(o{i});
    %     tmp = cell(1,N);
    %     for j=1:N
    %         oj = o{i}(:,j);
    %         tmp{j} = gpuArray(zeros(dim,dim));
    %         for r=1:dim
    %             for c=1:dim
    %                 if r==c
    %                     tmp{j}(r,c) = oj(r)*(1-oj(r));
    %                 else
    %                     tmp{j}(r,c) = -oj(r)*oj(c);
    %                 end
    %             end
    %         end
    %     end
    % end
    % Dropout for the hidden layers using "inverted" dropout: activations
    % and their derivatives are rescaled by 1/(1-dropout) here, so no
    % rescaling is needed at test time. (binornd requires the Statistics
    % and Machine Learning Toolbox.)
    % https://gist.github.com/ottokart/ebd3d32438c13a62ea3c
    if dropout > 0 && i < numLayers
        mask = binornd(1, 1-dropout, size(a));
        o{i} = mask .* o{i} / (1-dropout);
        do{i} = mask .* do{i} / (1-dropout);
    end
end
%% Backward pass
% Output error
if any(strcmp(loss, {'crossentropy', 'log', 'binary_crossentropy', 'crossentropy_binary'}))
    assert(any(strcmp(transfer{end}, {'logsig', 'softmax'})), 'Cross-entropy loss function requires logistic or softmax output units!')
end
if nargout < 3
    [~,delta] = backprop_loss(target, o{end}, loss, 'Normalization', normalization);
else
    [f,delta] = backprop_loss(target, o{end}, loss, 'Normalization', normalization);
end
% Backpropagate
dW = cell(1, numLayers);
db = cell(1, numLayers);
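% Standard backpropagation recursion: the output-layer delta is the loss
% derivative from backprop_loss scaled by the output transfer derivative
% (or its Jacobian for softmax); for hidden layers
% delta = do{i} .* (W{i+1}' * delta). The gradients are then
% dW{i} = delta * (previous layer output)' and db{i} is the sum of delta
% over the batch (columns).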
for i = numLayers:-1:1
    % Delta
    if i == numLayers % Output layer
        if strcmp(transfer{i}, 'softmax') % Softmax outputs cells with a Jacobian per sample
            for j = 1:length(do{i}), delta(:,j) = do{i}{j} * delta(:,j); end
        else
            delta = do{i} .* delta;
        end
    else % Input or hidden layer
        if strcmp(transfer{i}, 'softmax'), error('Softmax transfer function only supported for the output layer!'); end
        delta = do{i} .* (W{i+1}' * delta);
    end
    % Weight update
    if i > 1 % Hidden or output layer
        dW{i} = delta * o{i-1}';
    else % Input layer
        dW{i} = delta * input';
    end
    % Bias update
    db{i} = sum(delta, 2);
end
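
% A quick way to sanity-check the analytic gradient (illustrative sketch,
% using only this function's own interface, with dropout left at its
% default of 0 so the forward pass is deterministic):
%
%   e = 1e-6; Wp = W; Wm = W;
%   Wp{1}(1,1) = Wp{1}(1,1) + e;
%   Wm{1}(1,1) = Wm{1}(1,1) - e;
%   [~,~,fp] = backprop(Wp, b, transfer, X, T, 'crossentropy');
%   [~,~,fm] = backprop(Wm, b, transfer, X, T, 'crossentropy');
%   [dW,~]   = backprop(W,  b, transfer, X, T, 'crossentropy');
%   (fp - fm) / (2*e)   % should be close to dW{1}(1,1)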