From 4be73e25b5db68d94a58c094b6bfd31eece9827a Mon Sep 17 00:00:00 2001
From: umuguc
Date: Sat, 18 Apr 2015 00:28:20 +0200
Subject: [PATCH] mnistAutoencoder

---
 .../mnistAutoencoder/cnn_mnist_autoencoder.m  | 279 +++++++++++++++
 .../cnn_mnist_autoencoder_test_demo.m         |  76 +++++
 .../cnn_mnist_autoencoder_training_demo.m     |  33 ++
 examples/mnistAutoencoder/cnn_train.m         | 320 ++++++++++++++++++
 examples/mnistAutoencoder/cnn_train_adagrad.m | 318 +++++++++++++++++
 examples/mnistAutoencoder/euclideanloss.m     |  26 ++
 examples/mnistAutoencoder/sigmoid.m           |  16 +
 .../sigmoidcrossentropyloss.m                 |  29 ++
 examples/mnistAutoencoder/vl_simplenn.m       | 249 ++++++++++++++
 9 files changed, 1346 insertions(+)
 create mode 100644 examples/mnistAutoencoder/cnn_mnist_autoencoder.m
 create mode 100644 examples/mnistAutoencoder/cnn_mnist_autoencoder_test_demo.m
 create mode 100644 examples/mnistAutoencoder/cnn_mnist_autoencoder_training_demo.m
 create mode 100644 examples/mnistAutoencoder/cnn_train.m
 create mode 100644 examples/mnistAutoencoder/cnn_train_adagrad.m
 create mode 100644 examples/mnistAutoencoder/euclideanloss.m
 create mode 100644 examples/mnistAutoencoder/sigmoid.m
 create mode 100644 examples/mnistAutoencoder/sigmoidcrossentropyloss.m
 create mode 100644 examples/mnistAutoencoder/vl_simplenn.m

diff --git a/examples/mnistAutoencoder/cnn_mnist_autoencoder.m b/examples/mnistAutoencoder/cnn_mnist_autoencoder.m
new file mode 100644
index 00000000..bc5ac0be
--- /dev/null
+++ b/examples/mnistAutoencoder/cnn_mnist_autoencoder.m
@@ -0,0 +1,279 @@
+function [net, opts, imdb, info] = cnn_mnist_autoencoder
+%CNN_MNIST_AUTOENCODER Train a deep fully connected autoencoder on MNIST
+%   The encoder is 784-1000-500-250-30; the decoder mirrors it back to 784.
+
+net = getMnistAutoencoderNet;
+opts = getMnistAutoencoderOpts;
+
+if exist(opts.imdbPath, 'file')
+
+    load(opts.imdbPath);
+
+else
+
+    imdb = getMnistAutoencoderImdb(opts);
+
+    if ~exist(opts.expDir, 'dir')
+
+        mkdir(opts.expDir);
+
+    end
+
+    save(opts.imdbPath, 'imdb');
+
+end
+
+% [net, info] = cnn_train(net, imdb, @(imdb, batch) getMnistAutoencoderBatch(imdb, batch), opts);
+[net, info] = cnn_train_adagrad(net, imdb, @(imdb, batch) getMnistAutoencoderBatch(imdb, batch), opts);
+
+net.layers{end} = struct('name', 'data_hat_sigmoid', ...
+                         'type', 'sigmoid'          );
+
+net.layers{end + 1} = struct('type', 'euclideanloss');
+
+end
+
+% -------------------------------------------------------------------------
+function net = getMnistAutoencoderNet
+% -------------------------------------------------------------------------
+
+% Layer 1
+
+net.layers{1} = struct('biases'             , zeros(1, 1000, 'single')             , ...
+                       'biasesLearningRate' , 1                                    , ...
+                       'biasesWeightDecay'  , 0                                    , ...
+                       'filters'            , sparse_initialization([1 1 784 1000]), ...
+                       'filtersLearningRate', 1                                    , ...
+                       'filtersWeightDecay' , 1                                    , ...
+                       'name'               , 'encoder_1'                          , ...
+                       'pad'                , [0 0 0 0]                            , ...
+                       'stride'             , [1 1]                                , ...
+                       'type'               , 'conv'                               );
+
+net.layers{2} = struct('name', 'encoder_1_sigmoid', ...
+                       'type', 'sigmoid'          );
+
+% Layer 2
+
+net.layers{3} = struct('biases'             , zeros(1, 500, 'single')              , ...
+                       'biasesLearningRate' , 1                                    , ...
+                       'biasesWeightDecay'  , 0                                    , ...
+                       'filters'            , sparse_initialization([1 1 1000 500]), ...
+                       'filtersLearningRate', 1                                    , ...
+                       'filtersWeightDecay' , 1                                    , ...
+                       'name'               , 'encoder_2'                          , ...
+                       'pad'                , [0 0 0 0]                            , ...
+                       'stride'             , [1 1]                                , ...
+                       'type'               , 'conv'                               );
+
+net.layers{4} = struct('name', 'encoder_2_sigmoid', ...
+                       'type', 'sigmoid'          );
+
+% Layer 3
+
+net.layers{5} = struct('biases'             , zeros(1, 250, 'single')             , ...
+                       'biasesLearningRate' , 1                                   , ...
+                       'biasesWeightDecay'  , 0                                   , ...
+                       'filters'            , sparse_initialization([1 1 500 250]), ...
+                       'filtersLearningRate', 1                                   , ...
+                       'filtersWeightDecay' , 1                                   , ...
+                       'name'               , 'encoder_3'                         , ...
+                       'pad'                , [0 0 0 0]                           , ...
+                       'stride'             , [1 1]                               , ...
+                       'type'               , 'conv'                              );
+
+net.layers{6} = struct('name', 'encoder_3_sigmoid', ...
+                       'type', 'sigmoid'          );
+
+% Layer 4
+
+net.layers{7} = struct('biases'             , zeros(1, 30, 'single')             , ...
+                       'biasesLearningRate' , 1                                  , ...
+                       'biasesWeightDecay'  , 0                                  , ...
+                       'filters'            , sparse_initialization([1 1 250 30]), ...
+                       'filtersLearningRate', 1                                  , ...
+                       'filtersWeightDecay' , 1                                  , ...
+                       'name'               , 'code'                             , ...
+                       'pad'                , [0 0 0 0]                          , ...
+                       'stride'             , [1 1]                              , ...
+                       'type'               , 'conv'                             );
+
+% Layer 5
+
+net.layers{8} = struct('biases'             , zeros(1, 250, 'single')            , ...
+                       'biasesLearningRate' , 1                                  , ...
+                       'biasesWeightDecay'  , 0                                  , ...
+                       'filters'            , sparse_initialization([1 1 30 250]), ...
+                       'filtersLearningRate', 1                                  , ...
+                       'filtersWeightDecay' , 1                                  , ...
+                       'name'               , 'decoder_3'                        , ...
+                       'pad'                , [0 0 0 0]                          , ...
+                       'stride'             , [1 1]                              , ...
+                       'type'               , 'conv'                             );
+
+net.layers{9} = struct('name', 'decoder_3_sigmoid', ...
+                       'type', 'sigmoid'          );
+
+% Layer 6
+
+net.layers{10} = struct('biases'             , zeros(1, 500, 'single')             , ...
+                        'biasesLearningRate' , 1                                   , ...
+                        'biasesWeightDecay'  , 0                                   , ...
+                        'filters'            , sparse_initialization([1 1 250 500]), ...
+                        'filtersLearningRate', 1                                   , ...
+                        'filtersWeightDecay' , 1                                   , ...
+                        'name'               , 'decoder_2'                         , ...
+                        'pad'                , [0 0 0 0]                           , ...
+                        'stride'             , [1 1]                               , ...
+                        'type'               , 'conv'                              );
+
+net.layers{11} = struct('name', 'decoder_2_sigmoid', ...
+                        'type', 'sigmoid'          );
+
+% Layer 7
+
+net.layers{12} = struct('biases'             , zeros(1, 1000, 'single')             , ...
+                        'biasesLearningRate' , 1                                    , ...
+                        'biasesWeightDecay'  , 0                                    , ...
+                        'filters'            , sparse_initialization([1 1 500 1000]), ...
+                        'filtersLearningRate', 1                                    , ...
+                        'filtersWeightDecay' , 1                                    , ...
+                        'name'               , 'decoder_1'                          , ...
+                        'pad'                , [0 0 0 0]                            , ...
+                        'stride'             , [1 1]                               , ...
+                        'type'               , 'conv'                               );
+
+net.layers{13} = struct('name', 'decoder_1_sigmoid', ...
+                        'type', 'sigmoid'          );
+
+% Layer 8
+
+net.layers{14} = struct('biases'             , zeros(1, 784, 'single')              , ...
+                        'biasesLearningRate' , 1                                    , ...
+                        'biasesWeightDecay'  , 0                                    , ...
+                        'filters'            , sparse_initialization([1 1 1000 784]), ...
+                        'filtersLearningRate', 1                                    , ...
+                        'filtersWeightDecay' , 1                                    , ...
+                        'name'               , 'data_hat'                           , ...
+                        'pad'                , [0 0 0 0]                            , ...
+                        'stride'             , [1 1]                                , ...
+                        'type'               , 'conv'                               );
+
+net.layers{15} = struct('type', 'sigmoidcrossentropyloss');
+
+vl_simplenn_display(net);
+
+end
+
+% -------------------------------------------------------------------------
+function filters = sparse_initialization(d)
+% -------------------------------------------------------------------------
+
+filters = zeros(d, 'single');
+
+for index = 1 : d(4)
+
+    p = randperm(d(3), 15);
+
+    filters(1, 1, p, index) = randn(1, 1, 15, 1);
+
+end
+
+end
+
+% -------------------------------------------------------------------------
+function opts = getMnistAutoencoderOpts
+% -------------------------------------------------------------------------
+
+opts.batchSize = 100;
+opts.conserveMemory = false;
+opts.continue = false;
+opts.dataDir = fullfile('data','mnist');
+opts.display = 10;
+opts.delta = 1e-8;
+opts.errorType = 'euclideanloss';
+opts.expDir = fullfile('data','mnistAutoencoder');
+opts.imdbPath = fullfile(opts.expDir, 'imdb.mat');
+% opts.learningRate = 1e-4;
+opts.learningRate = 1e-2;
+% opts.momentum = 0.9;
+% opts.numEpochs = 6667; % 6667 epochs is ~4000000 iterations.
+opts.numEpochs = 108; % 108 epochs is ~65000 iterations.
+opts.plotDiagnostics = false;
+opts.prefetch = false;
+opts.snapshot = 10;
+opts.sync = true;
+opts.test_interval = 10;
+opts.train = [];
+opts.useGpu = true;
+opts.val = [];
+opts.weightDecay = 5e-4;
+
+end
+
+% -------------------------------------------------------------------------
+function imdb = getMnistAutoencoderImdb(opts)
+% -------------------------------------------------------------------------
+% Prepare the imdb structure; returns image data rescaled to the range [0, 1]
+files = {'train-images-idx3-ubyte', ...
+         'train-labels-idx1-ubyte', ...
+         't10k-images-idx3-ubyte', ...
+         't10k-labels-idx1-ubyte'} ;
+
+if ~exist(opts.dataDir, 'dir')
+  mkdir(opts.dataDir) ;
+end
+
+for i=1:4
+  if ~exist(fullfile(opts.dataDir, files{i}), 'file')
+    url = sprintf('http://yann.lecun.com/exdb/mnist/%s.gz',files{i}) ;
+    fprintf('downloading %s\n', url) ;
+    gunzip(url, opts.dataDir) ;
+  end
+end
+
+f=fopen(fullfile(opts.dataDir, 'train-images-idx3-ubyte'),'r') ;
+x1=fread(f,inf,'uint8');
+fclose(f) ;
+x1=permute(reshape(x1(17:end),28,28,60e3),[2 1 3]) ;
+
+f=fopen(fullfile(opts.dataDir, 't10k-images-idx3-ubyte'),'r') ;
+x2=fread(f,inf,'uint8');
+fclose(f) ;
+x2=permute(reshape(x2(17:end),28,28,10e3),[2 1 3]) ;
+
+f=fopen(fullfile(opts.dataDir, 'train-labels-idx1-ubyte'),'r') ;
+y1=fread(f,inf,'uint8');
+fclose(f) ;
+y1=double(y1(9:end)')+1 ;
+
+f=fopen(fullfile(opts.dataDir, 't10k-labels-idx1-ubyte'),'r') ;
+y2=fread(f,inf,'uint8');
+fclose(f) ;
+y2=double(y2(9:end)')+1 ;
+
+set = [ones(1,numel(y1)) 2*ones(1,numel(y2))];
+% data = single(reshape(cat(3, x1, x2),28,28,1,[]));
+% dataMean = mean(data(:,:,:,set == 1), 4);
+% data = bsxfun(@minus, data, dataMean) ;
+data = single(reshape(cat(3, x1, x2), 1, 1, 784, []));
+data = data - min(data(:)); data = data / max(data(:));
+
+imdb.images.data = data ;
+% imdb.images.data_mean = dataMean;
+imdb.images.labels = cat(2, y1, y2) ;
+imdb.images.set = set ;
+imdb.meta.sets = {'train', 'val', 'test'} ;
+imdb.meta.classes = arrayfun(@(x)sprintf('%d',x),0:9,'uniformoutput',false) ;
+
+end
+
+% -------------------------------------------------------------------------
+function [im, labels] = getMnistAutoencoderBatch(imdb, batch)
+% -------------------------------------------------------------------------
+
+im = imdb.images.data(:, :, :, batch);
+labels = im;
+
+end
+
diff --git a/examples/mnistAutoencoder/cnn_mnist_autoencoder_test_demo.m b/examples/mnistAutoencoder/cnn_mnist_autoencoder_test_demo.m
new file mode 100644
index 00000000..77735a41
--- /dev/null
+++ b/examples/mnistAutoencoder/cnn_mnist_autoencoder_test_demo.m
@@ -0,0 +1,76 @@
+%%
+
+close all;
+clear all;
+clc;
+
+%%
+
+run('~/GitHub/umuguc/matconvnet/matlab/vl_setupnn');
+
+%%
+
+load('net.mat');
+load(opts.imdbPath);
+
+%%
+
+N = [5 2];
+
+Y = zeros(N(1) * N(2), 1);
+
+h = figure;
+
+for index = 1 : N(1) * N(2)
+
+    im = imdb.images.data(:, :, :, end - index + 1);
+
+    if opts.useGpu
+
+        im = gpuArray(im);
+
+    end
+
+    subplot(N(1), 2 * N(2), 2 * index - 1);
+
+    imagesc(reshape(im, 28, 28));
+
+    axis off;
+    axis square;
+
+    drawnow;
+
+    net.layers{end}.class = im;
+
+    res = vl_simplenn(net, im, [], [], 'disableDropout', true);
+
+    subplot(N(1), 2 * N(2), 2 * index);
+
+    imagesc(reshape(res(end - 1).x, 28, 28));
+
+    axis off;
+    axis square;
+
+    drawnow;
+
+    Y(index) = gather(res(end).x);
+
+end
+
+disp(['Euclidean loss: ' num2str(mean(Y))]);
+
+%%
+
+% Test net:
+
+% layer| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16|
+% type| 
cnv|sigmoid| cnv|sigmoid| cnv|sigmoid| cnv| cnv|sigmoid| cnv|sigmoid| cnv|sigmoid| cnv|sigmoid|euclideanloss|
+% support| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1|
+% stride| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1|
+% pad| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|
+% out dim| 1000| 1000| 500| 500| 250| 250| 30| 250| 250| 500| 500| 1000| 1000| 784| 784| 784|
+% filt dim| 784| n/a| 1000| n/a| 500| n/a| 250| 30| n/a| 250| n/a| 500| n/a| 1000| n/a| n/a|
+% rec. field| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1|
+% c/g net KB| 3066/0| 0/0| 1955/0| 0/0| 489/0| 0/0| 29/0| 30/0| 0/0| 490/0| 0/0| 1957/0| 0/0| 3066/0| 0/0| 0/0|
+% total network CPU/GPU memory: 10.8/0 MB
+
diff --git a/examples/mnistAutoencoder/cnn_mnist_autoencoder_training_demo.m b/examples/mnistAutoencoder/cnn_mnist_autoencoder_training_demo.m
new file mode 100644
index 00000000..baa3e709
--- /dev/null
+++ b/examples/mnistAutoencoder/cnn_mnist_autoencoder_training_demo.m
@@ -0,0 +1,33 @@
+%%
+
+close all;
+clear all;
+clc;
+
+%%
+
+run('~/GitHub/umuguc/matconvnet/matlab/vl_setupnn');
+
+%%
+
+rng(0);
+
+[net, opts, imdb, info] = cnn_mnist_autoencoder;
+
+save('net.mat', 'net', 'opts', 'info');
+
+%%
+
+% Training net:
+
+% layer| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15|
+% type| cnv|sigmoid| cnv|sigmoid| cnv|sigmoid| cnv| cnv|sigmoid| cnv|sigmoid| cnv|sigmoid| cnv|sigmoidcrossentropyloss|
+% support| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1| 1x1|
+% stride| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1|
+% pad| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|
+% out dim| 1000| 1000| 500| 500| 250| 250| 30| 250| 250| 500| 500| 1000| 1000| 784| 784|
+% filt dim| 784| n/a| 1000| n/a| 500| n/a| 250| 30| n/a| 250| n/a| 500| n/a| 1000| n/a|
+% rec. field| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1|
+% c/g net KB| 3066/0| 0/0| 1955/0| 0/0| 489/0| 0/0| 29/0| 30/0| 0/0| 490/0| 0/0| 1957/0| 0/0| 3066/0| 0/0|
+% total network CPU/GPU memory: 10.8/0 MB
+
diff --git a/examples/mnistAutoencoder/cnn_train.m b/examples/mnistAutoencoder/cnn_train.m
new file mode 100644
index 00000000..a92e794a
--- /dev/null
+++ b/examples/mnistAutoencoder/cnn_train.m
@@ -0,0 +1,320 @@
+function [net, info] = cnn_train(net, imdb, getBatch, varargin)
+% CNN_TRAIN Demonstrates training a CNN
+%    CNN_TRAIN() is an example learner implementing stochastic gradient
+%    descent with momentum to train a CNN for image classification.
+%    It can be used with different datasets by providing a suitable
+%    getBatch function.
+
+opts.train = [] ;
+opts.val = [] ;
+opts.numEpochs = 300 ;
+opts.batchSize = 256 ;
+opts.useGpu = false ;
+opts.learningRate = 0.001 ;
+opts.continue = false ;
+opts.expDir = fullfile('data','exp') ;
+opts.conserveMemory = false ;
+opts.sync = true ;
+opts.prefetch = false ;
+opts.weightDecay = 0.0005 ;
+opts.momentum = 0.9 ;
+opts.errorType = 'multiclass' ;
+opts.plotDiagnostics = false ;
+opts.display = 1;
+opts.snapshot = 1;
+opts.test_interval = 1;
+opts = vl_argparse(opts, varargin) ;
+
+if ~exist(opts.expDir, 'dir'), mkdir(opts.expDir) ; end
+if isempty(opts.train), opts.train = find(imdb.images.set==1) ; end
+if isempty(opts.val), opts.val = find(imdb.images.set==2) ; end
+if isnan(opts.train), opts.train = [] ; end
+
+% -------------------------------------------------------------------------
+% Network initialization
+% -------------------------------------------------------------------------
+
+for i=1:numel(net.layers)
+  if ~strcmp(net.layers{i}.type,'conv'), continue; end
+  net.layers{i}.filtersMomentum = zeros(size(net.layers{i}.filters), ...
+ class(net.layers{i}.filters)) ; + net.layers{i}.biasesMomentum = zeros(size(net.layers{i}.biases), ... + class(net.layers{i}.biases)) ; %#ok<*ZEROLIKE> + if ~isfield(net.layers{i}, 'filtersLearningRate') + net.layers{i}.filtersLearningRate = 1 ; + end + if ~isfield(net.layers{i}, 'biasesLearningRate') + net.layers{i}.biasesLearningRate = 1 ; + end + if ~isfield(net.layers{i}, 'filtersWeightDecay') + net.layers{i}.filtersWeightDecay = 1 ; + end + if ~isfield(net.layers{i}, 'biasesWeightDecay') + net.layers{i}.biasesWeightDecay = 1 ; + end +end + +if opts.useGpu + net = vl_simplenn_move(net, 'gpu') ; + for i=1:numel(net.layers) + if ~strcmp(net.layers{i}.type,'conv'), continue; end + net.layers{i}.filtersMomentum = gpuArray(net.layers{i}.filtersMomentum) ; + net.layers{i}.biasesMomentum = gpuArray(net.layers{i}.biasesMomentum) ; + end +end + +% ------------------------------------------------------------------------- +% Train and validate +% ------------------------------------------------------------------------- + +rng(0) ; + +if opts.useGpu + one = gpuArray(single(1)) ; +else + one = single(1) ; +end + +info.train.objective = [] ; +info.train.error = [] ; +info.train.topFiveError = [] ; +info.train.speed = [] ; +info.val.objective = [] ; +info.val.error = [] ; +info.val.topFiveError = [] ; +info.val.speed = [] ; + +lr = 0 ; +res = [] ; +for epoch=1:opts.numEpochs + prevLr = lr ; + lr = opts.learningRate(min(epoch, numel(opts.learningRate))) ; + + % fast-forward to where we stopped + modelPath = @(ep) fullfile(opts.expDir, sprintf('net-epoch-%d.mat', ep)); + modelFigPath = fullfile(opts.expDir, 'net-train.pdf') ; + if opts.continue + if exist(modelPath(epoch),'file') + if epoch == opts.numEpochs + load(modelPath(epoch), 'net', 'info') ; + end + continue ; + end + if epoch > 1 + fprintf('resuming by loading epoch %d\n', epoch-1) ; + load(modelPath(epoch-1), 'net', 'info') ; + end + end + + train = opts.train(randperm(numel(opts.train))) ; + val = opts.val ; + + info.train.objective(end+1) = 0 ; + info.train.error(end+1) = 0 ; + info.train.topFiveError(end+1) = 0 ; + info.train.speed(end+1) = 0 ; + info.val.objective(end+1) = 0 ; + info.val.error(end+1) = 0 ; + info.val.topFiveError(end+1) = 0 ; + info.val.speed(end+1) = 0 ; + + % reset momentum if needed + if prevLr ~= lr + fprintf('learning rate changed (%f --> %f): resetting momentum\n', prevLr, lr) ; + for l=1:numel(net.layers) + if ~strcmp(net.layers{l}.type, 'conv'), continue ; end + net.layers{l}.filtersMomentum = 0 * net.layers{l}.filtersMomentum ; + net.layers{l}.biasesMomentum = 0 * net.layers{l}.biasesMomentum ; + end + end + + for t=1:opts.batchSize:numel(train) + % get next image batch and labels + batch = train(t:min(t+opts.batchSize-1, numel(train))) ; + batch_time = tic ; + fprintf('training: epoch %02d: processing batch %3d of %3d ...', epoch, ... + fix(t/opts.batchSize)+1, ceil(numel(train)/opts.batchSize)) ; + [im, labels] = getBatch(imdb, batch) ; + if opts.prefetch + nextBatch = train(t+opts.batchSize:min(t+2*opts.batchSize-1, numel(train))) ; + getBatch(imdb, nextBatch) ; + end + if opts.useGpu + im = gpuArray(im) ; + end + + % backprop + net.layers{end}.class = labels ; + res = vl_simplenn(net, im, one, res, ... + 'conserveMemory', opts.conserveMemory, ... + 'sync', opts.sync) ; + + % gradient step + for l=1:numel(net.layers) + if ~strcmp(net.layers{l}.type, 'conv'), continue ; end + + net.layers{l}.filtersMomentum = ... + opts.momentum * net.layers{l}.filtersMomentum ... 
+        - (lr * net.layers{l}.filtersLearningRate) * ...
+          (opts.weightDecay * net.layers{l}.filtersWeightDecay) * net.layers{l}.filters ...
+        - (lr * net.layers{l}.filtersLearningRate) / numel(batch) * res(l).dzdw{1} ;
+
+      net.layers{l}.biasesMomentum = ...
+        opts.momentum * net.layers{l}.biasesMomentum ...
+        - (lr * net.layers{l}.biasesLearningRate) * ...
+          (opts.weightDecay * net.layers{l}.biasesWeightDecay) * net.layers{l}.biases ...
+        - (lr * net.layers{l}.biasesLearningRate) / numel(batch) * res(l).dzdw{2} ;
+
+      net.layers{l}.filters = net.layers{l}.filters + net.layers{l}.filtersMomentum ;
+      net.layers{l}.biases = net.layers{l}.biases + net.layers{l}.biasesMomentum ;
+    end
+
+    % print information
+    batch_time = toc(batch_time) ;
+    speed = numel(batch)/batch_time ;
+    info.train = updateError(opts, info.train, net, res, batch_time) ;
+
+    fprintf(' %.2f s (%.1f images/s)', batch_time, speed) ;
+    n = t + numel(batch) - 1 ;
+    switch opts.errorType
+      case 'multiclass'
+        fprintf(' err %.1f err5 %.1f', ...
+          info.train.error(end)/n*100, info.train.topFiveError(end)/n*100) ;
+        fprintf('\n') ;
+      case 'binary'
+        fprintf(' err %.1f', ...
+          info.train.error(end)/n*100) ;
+        fprintf('\n') ;
+      case 'euclideanloss'
+        fprintf(' err %.1f', info.train.error(end) / n);
+        fprintf('\n') ;
+    end
+
+    % debug info
+    if opts.plotDiagnostics
+      figure(2) ; vl_simplenn_diagnose(net,res) ; drawnow ;
+    end
+  end % next batch
+
+  % evaluation on validation set
+  if epoch == 1 || rem(epoch, opts.test_interval) == 0 || epoch == opts.numEpochs
+    for t=1:opts.batchSize:numel(val)
+      batch_time = tic ;
+      batch = val(t:min(t+opts.batchSize-1, numel(val))) ;
+      fprintf('validation: epoch %02d: processing batch %3d of %3d ...', epoch, ...
+        fix(t/opts.batchSize)+1, ceil(numel(val)/opts.batchSize)) ;
+      [im, labels] = getBatch(imdb, batch) ;
+      if opts.prefetch
+        nextBatch = val(t+opts.batchSize:min(t+2*opts.batchSize-1, numel(val))) ;
+        getBatch(imdb, nextBatch) ;
+      end
+      if opts.useGpu
+        im = gpuArray(im) ;
+      end
+
+      net.layers{end}.class = labels ;
+      res = vl_simplenn(net, im, [], res, ...
+        'disableDropout', true, ...
+        'conserveMemory', opts.conserveMemory, ...
+        'sync', opts.sync) ;
+
+      % print information
+      batch_time = toc(batch_time) ;
+      speed = numel(batch)/batch_time ;
+      info.val = updateError(opts, info.val, net, res, batch_time) ;
+
+      fprintf(' %.2f s (%.1f images/s)', batch_time, speed) ;
+      n = t + numel(batch) - 1 ;
+      switch opts.errorType
+        case 'multiclass'
+          fprintf(' err %.1f err5 %.1f', ...
+            info.val.error(end)/n*100, info.val.topFiveError(end)/n*100) ;
+          fprintf('\n') ;
+        case 'binary'
+          fprintf(' err %.1f', ...
+            info.val.error(end)/n*100) ;
+          fprintf('\n') ;
+        case 'euclideanloss'
+          fprintf(' err %.1f', info.val.error(end) / n);
+          fprintf('\n') ;
+      end
+    end
+  end
+
+  % save
+  info.train.objective(end) = info.train.objective(end) / numel(train) ;
+  info.train.error(end) = info.train.error(end) / numel(train) ;
+  info.train.topFiveError(end) = info.train.topFiveError(end) / numel(train) ;
+  info.train.speed(end) = numel(train) / info.train.speed(end) ;
+  info.val.objective(end) = info.val.objective(end) / numel(val) ;
+  info.val.error(end) = info.val.error(end) / numel(val) ;
+  info.val.topFiveError(end) = info.val.topFiveError(end) / numel(val) ;
+  info.val.speed(end) = numel(val) / info.val.speed(end) ;
+  if epoch == 1 || rem(epoch, opts.snapshot) == 0 || epoch == opts.numEpochs
+    save(modelPath(epoch), 'net', 'info') ;
+  end
+
+  if epoch == 1 || rem(epoch, opts.display) == 0 || epoch == opts.numEpochs
+    figure(1) ; clf ;
+    subplot(1,2,1) ;
+    semilogy(1:epoch, info.train.objective, 'k') ; hold on ;
+    semilogy([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.objective([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b') ;
+    xlabel('training epoch') ; ylabel('energy') ;
+    grid on ;
+    h=legend('train', 'val') ;
+    set(h,'color','none');
+    title('objective') ;
+    subplot(1,2,2) ;
+    switch opts.errorType
+      case 'multiclass'
+        plot(1:epoch, info.train.error, 'k') ; hold on ;
+        plot(1:epoch, info.train.topFiveError, 'k--') ;
+        plot([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.error([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b') ;
+        plot([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.topFiveError([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b--') ;
+        h=legend('train','train-5','val','val-5') ;
+      case 'binary'
+        plot(1:epoch, info.train.error, 'k') ; hold on ;
+        plot([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.error([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b') ;
+        h=legend('train','val') ;
+      case 'euclideanloss'
+        plot(1 : epoch, info.train.error, 'k'); hold on;
+        plot([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.error([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b') ;
+        h = legend('train', 'val') ;
+    end
+    grid on ;
+    xlabel('training epoch') ; ylabel('error') ;
+    set(h,'color','none') ;
+    title('error') ;
+    drawnow ;
+    print(1, modelFigPath, '-dpdf') ;
+  end
+end
+
+% -------------------------------------------------------------------------
+function info = updateError(opts, info, net, res, speed)
+% -------------------------------------------------------------------------
+predictions = gather(res(end-1).x) ;
+sz = size(predictions) ;
+n = prod(sz(1:2)) ;
+
+labels = net.layers{end}.class ;
+info.objective(end) = info.objective(end) + sum(double(gather(res(end).x))) ;
+info.speed(end) = info.speed(end) + speed ;
+switch opts.errorType
+  case 'multiclass'
+    [~,predictions] = sort(predictions, 3, 'descend') ;
+    error = ~bsxfun(@eq, predictions, reshape(labels, 1, 1, 1, [])) ;
+    info.error(end) = info.error(end) + ...
+      sum(sum(sum(error(:,:,1,:))))/n ;
+    info.topFiveError(end) = info.topFiveError(end) + ...
+      sum(sum(sum(min(error(:,:,1:5,:),[],3))))/n ;
+  case 'binary'
+    error = bsxfun(@times, predictions, labels) < 0 ;
+    info.error(end) = info.error(end) + sum(error(:))/n ;
+  case 'euclideanloss'
+    error = euclideanloss(sigmoid(predictions), labels);
+    info.error(end) = info.error(end) + error;
+end
+
+
+
diff --git a/examples/mnistAutoencoder/cnn_train_adagrad.m b/examples/mnistAutoencoder/cnn_train_adagrad.m
new file mode 100644
index 00000000..eb8e1b5e
--- /dev/null
+++ b/examples/mnistAutoencoder/cnn_train_adagrad.m
@@ -0,0 +1,318 @@
+function [net, info] = cnn_train_adagrad(net, imdb, getBatch, varargin)
+% CNN_TRAIN_ADAGRAD Demonstrates training a CNN with AdaGrad
+%    CNN_TRAIN_ADAGRAD() is an example learner implementing stochastic
+%    gradient descent with the AdaGrad per-parameter learning rate rule
+%    to train a CNN for image classification. It can be used with
+%    different datasets by providing a suitable getBatch function.
+
+opts.train = [] ;
+opts.val = [] ;
+opts.numEpochs = 300 ;
+opts.batchSize = 256 ;
+opts.useGpu = false ;
+opts.learningRate = 0.001 ;
+opts.continue = false ;
+opts.expDir = fullfile('data','exp') ;
+opts.conserveMemory = false ;
+opts.sync = true ;
+opts.prefetch = false ;
+opts.weightDecay = 0.0005 ;
+opts.errorType = 'multiclass' ;
+opts.plotDiagnostics = false ;
+opts.delta = 1e-8;
+opts.display = 1;
+opts.snapshot = 1;
+opts.test_interval = 1;
+opts = vl_argparse(opts, varargin) ;
+
+if ~exist(opts.expDir, 'dir'), mkdir(opts.expDir) ; end
+if isempty(opts.train), opts.train = find(imdb.images.set==1) ; end
+if isempty(opts.val), opts.val = find(imdb.images.set==2) ; end
+if isnan(opts.train), opts.train = [] ; end
+
+% -------------------------------------------------------------------------
+% Network initialization
+% -------------------------------------------------------------------------
+
+for i=1:numel(net.layers)
+  if ~strcmp(net.layers{i}.type,'conv'), continue; end
+  net.layers{i}.filtersMomentum = zeros(size(net.layers{i}.filters), ...
+    class(net.layers{i}.filters)) ;
+  net.layers{i}.biasesMomentum = zeros(size(net.layers{i}.biases), ...
+ class(net.layers{i}.biases)) ; %#ok<*ZEROLIKE> + if ~isfield(net.layers{i}, 'filtersLearningRate') + net.layers{i}.filtersLearningRate = 1 ; + end + if ~isfield(net.layers{i}, 'biasesLearningRate') + net.layers{i}.biasesLearningRate = 1 ; + end + if ~isfield(net.layers{i}, 'filtersWeightDecay') + net.layers{i}.filtersWeightDecay = 1 ; + end + if ~isfield(net.layers{i}, 'biasesWeightDecay') + net.layers{i}.biasesWeightDecay = 1 ; + end +end + +if opts.useGpu + net = vl_simplenn_move(net, 'gpu') ; + for i=1:numel(net.layers) + if ~strcmp(net.layers{i}.type,'conv'), continue; end + net.layers{i}.filtersMomentum = gpuArray(net.layers{i}.filtersMomentum) ; + net.layers{i}.biasesMomentum = gpuArray(net.layers{i}.biasesMomentum) ; + end +end + +G_f = cell(numel(net.layers), 1); +G_b = cell(numel(net.layers), 1); + +for l=1:numel(net.layers) + + if ~strcmp(net.layers{l}.type, 'conv'), continue ; end + + G_f{l} = zeros(size(net.layers{l}.filters), 'single'); + G_b{l} = zeros(size(net.layers{l}.biases), 'single'); + +end + +% ------------------------------------------------------------------------- +% Train and validate +% ------------------------------------------------------------------------- + +rng(0) ; + +if opts.useGpu + one = gpuArray(single(1)) ; +else + one = single(1) ; +end + +info.train.objective = [] ; +info.train.error = [] ; +info.train.topFiveError = [] ; +info.train.speed = [] ; +info.val.objective = [] ; +info.val.error = [] ; +info.val.topFiveError = [] ; +info.val.speed = [] ; + +lr = opts.learningRate ; +res = [] ; +for epoch=1:opts.numEpochs + + % fast-forward to where we stopped + modelPath = @(ep) fullfile(opts.expDir, sprintf('net-epoch-%d.mat', ep)); + modelFigPath = fullfile(opts.expDir, 'net-train.pdf') ; + if opts.continue + if exist(modelPath(epoch),'file') + if epoch == opts.numEpochs + load(modelPath(epoch), 'net', 'info') ; + end + continue ; + end + if epoch > 1 + fprintf('resuming by loading epoch %d\n', epoch-1) ; + load(modelPath(epoch-1), 'net', 'info') ; + end + end + + train = opts.train(randperm(numel(opts.train))) ; + val = opts.val ; + + info.train.objective(end+1) = 0 ; + info.train.error(end+1) = 0 ; + info.train.topFiveError(end+1) = 0 ; + info.train.speed(end+1) = 0 ; + info.val.objective(end+1) = 0 ; + info.val.error(end+1) = 0 ; + info.val.topFiveError(end+1) = 0 ; + info.val.speed(end+1) = 0 ; + + for t=1:opts.batchSize:numel(train) + % get next image batch and labels + batch = train(t:min(t+opts.batchSize-1, numel(train))) ; + batch_time = tic ; + fprintf('training: epoch %02d: processing batch %3d of %3d ...', epoch, ... + fix(t/opts.batchSize)+1, ceil(numel(train)/opts.batchSize)) ; + [im, labels] = getBatch(imdb, batch) ; + if opts.prefetch + nextBatch = train(t+opts.batchSize:min(t+2*opts.batchSize-1, numel(train))) ; + getBatch(imdb, nextBatch) ; + end + if opts.useGpu + im = gpuArray(im) ; + end + + % backprop + net.layers{end}.class = labels ; + res = vl_simplenn(net, im, one, res, ... + 'conserveMemory', opts.conserveMemory, ... + 'sync', opts.sync) ; + + % gradient step + for l=1:numel(net.layers) + if ~strcmp(net.layers{l}.type, 'conv'), continue ; end + + g_f = (net.layers{l}.filtersLearningRate) * ... + (opts.weightDecay * net.layers{l}.filtersWeightDecay) * net.layers{l}.filters + ... + (net.layers{l}.filtersLearningRate) / numel(batch) * res(l).dzdw{1}; + g_b = (net.layers{l}.biasesLearningRate) * ... + (opts.weightDecay * net.layers{l}.biasesWeightDecay) * net.layers{l}.biases + ... 
+ (net.layers{l}.biasesLearningRate) / numel(batch) * res(l).dzdw{2}; + + G_f{l} = G_f{l} + g_f .^ 2; + G_b{l} = G_b{l} + g_b .^ 2; + + net.layers{l}.filters = net.layers{l}.filters - lr ./ (opts.delta + sqrt(G_f{l})) .* g_f; + net.layers{l}.biases = net.layers{l}.biases - lr ./ (opts.delta + sqrt(G_b{l})) .* g_b; + end + + % print information + batch_time = toc(batch_time) ; + speed = numel(batch)/batch_time ; + info.train = updateError(opts, info.train, net, res, batch_time) ; + + fprintf(' %.2f s (%.1f images/s)', batch_time, speed) ; + n = t + numel(batch) - 1 ; + switch opts.errorType + case 'multiclass' + fprintf(' err %.1f err5 %.1f', ... + info.train.error(end)/n*100, info.train.topFiveError(end)/n*100) ; + fprintf('\n') ; + case 'binary' + fprintf(' err %.1f', ... + info.train.error(end)/n*100) ; + fprintf('\n') ; + case 'euclideanloss' + fprintf(' err %.1f', info.train.error(end) / n); + fprintf('\n') ; + end + + % debug info + if opts.plotDiagnostics + figure(2) ; vl_simplenn_diagnose(net,res) ; drawnow ; + end + end % next batch + + % evaluation on validation set + if epoch == 1 || rem(epoch, opts.test_interval) == 0 || epoch == opts.numEpochs + for t=1:opts.batchSize:numel(val) + batch_time = tic ; + batch = val(t:min(t+opts.batchSize-1, numel(val))) ; + fprintf('validation: epoch %02d: processing batch %3d of %3d ...', epoch, ... + fix(t/opts.batchSize)+1, ceil(numel(val)/opts.batchSize)) ; + [im, labels] = getBatch(imdb, batch) ; + if opts.prefetch + nextBatch = val(t+opts.batchSize:min(t+2*opts.batchSize-1, numel(val))) ; + getBatch(imdb, nextBatch) ; + end + if opts.useGpu + im = gpuArray(im) ; + end + + net.layers{end}.class = labels ; + res = vl_simplenn(net, im, [], res, ... + 'disableDropout', true, ... + 'conserveMemory', opts.conserveMemory, ... + 'sync', opts.sync) ; + + % print information + batch_time = toc(batch_time) ; + speed = numel(batch)/batch_time ; + info.val = updateError(opts, info.val, net, res, batch_time) ; + + fprintf(' %.2f s (%.1f images/s)', batch_time, speed) ; + n = t + numel(batch) - 1 ; + switch opts.errorType + case 'multiclass' + fprintf(' err %.1f err5 %.1f', ... + info.val.error(end)/n*100, info.val.topFiveError(end)/n*100) ; + fprintf('\n') ; + case 'binary' + fprintf(' err %.1f', ... 
+            info.val.error(end)/n*100) ;
+          fprintf('\n') ;
+        case 'euclideanloss'
+          fprintf(' err %.1f', info.val.error(end) / n);
+          fprintf('\n') ;
+      end
+    end
+  end
+
+  % save
+  info.train.objective(end) = info.train.objective(end) / numel(train) ;
+  info.train.error(end) = info.train.error(end) / numel(train) ;
+  info.train.topFiveError(end) = info.train.topFiveError(end) / numel(train) ;
+  info.train.speed(end) = numel(train) / info.train.speed(end) ;
+  info.val.objective(end) = info.val.objective(end) / numel(val) ;
+  info.val.error(end) = info.val.error(end) / numel(val) ;
+  info.val.topFiveError(end) = info.val.topFiveError(end) / numel(val) ;
+  info.val.speed(end) = numel(val) / info.val.speed(end) ;
+  if epoch == 1 || rem(epoch, opts.snapshot) == 0 || epoch == opts.numEpochs
+    save(modelPath(epoch), 'net', 'info') ;
+  end
+
+  if epoch == 1 || rem(epoch, opts.display) == 0 || epoch == opts.numEpochs
+    figure(1) ; clf ;
+    subplot(1,2,1) ;
+    semilogy(1:epoch, info.train.objective, 'k') ; hold on ;
+    semilogy([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.objective([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b') ;
+    xlabel('training epoch') ; ylabel('energy') ;
+    grid on ;
+    h=legend('train', 'val') ;
+    set(h,'color','none');
+    title('objective') ;
+    subplot(1,2,2) ;
+    switch opts.errorType
+      case 'multiclass'
+        plot(1:epoch, info.train.error, 'k') ; hold on ;
+        plot(1:epoch, info.train.topFiveError, 'k--') ;
+        plot([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.error([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b') ;
+        plot([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.topFiveError([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b--') ;
+        h=legend('train','train-5','val','val-5') ;
+      case 'binary'
+        plot(1:epoch, info.train.error, 'k') ; hold on ;
+        plot([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.error([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b') ;
+        h=legend('train','val') ;
+      case 'euclideanloss'
+        plot(1 : epoch, info.train.error, 'k'); hold on;
+        plot([1 opts.test_interval : opts.test_interval : epoch epoch], info.val.error([1 opts.test_interval : opts.test_interval : epoch epoch]), 'b') ;
+        h = legend('train', 'val') ;
+    end
+    grid on ;
+    xlabel('training epoch') ; ylabel('error') ;
+    set(h,'color','none') ;
+    title('error') ;
+    drawnow ;
+    print(1, modelFigPath, '-dpdf') ;
+  end
+end
+
+% -------------------------------------------------------------------------
+function info = updateError(opts, info, net, res, speed)
+% -------------------------------------------------------------------------
+predictions = gather(res(end-1).x) ;
+sz = size(predictions) ;
+n = prod(sz(1:2)) ;
+
+labels = net.layers{end}.class ;
+info.objective(end) = info.objective(end) + sum(double(gather(res(end).x))) ;
+info.speed(end) = info.speed(end) + speed ;
+switch opts.errorType
+  case 'multiclass'
+    [~,predictions] = sort(predictions, 3, 'descend') ;
+    error = ~bsxfun(@eq, predictions, reshape(labels, 1, 1, 1, [])) ;
+    info.error(end) = info.error(end) + ...
+      sum(sum(sum(error(:,:,1,:))))/n ;
+    info.topFiveError(end) = info.topFiveError(end) + ...
+      sum(sum(sum(min(error(:,:,1:5,:),[],3))))/n ;
+  case 'binary'
+    error = bsxfun(@times, predictions, labels) < 0 ;
+    info.error(end) = info.error(end) + sum(error(:))/n ;
+  case 'euclideanloss'
+    error = euclideanloss(sigmoid(predictions), labels);
+    info.error(end) = info.error(end) + error;
+end
+
+
+
diff --git a/examples/mnistAutoencoder/euclideanloss.m b/examples/mnistAutoencoder/euclideanloss.m
new file mode 100644
index 00000000..b5a795c9
--- /dev/null
+++ b/examples/mnistAutoencoder/euclideanloss.m
@@ -0,0 +1,26 @@
+function Y = euclideanloss(X, c, dzdy)
+%EUCLIDEANLOSS Euclidean (sum of squared differences) loss
+%   Forward: Y = 1/2 * sum((X - c).^2). Backward (DZDY given): Y = dzdy * (X - c).
+
+assert(numel(X) == numel(c));
+
+d = size(X);
+
+assert(all(d == size(c)));
+
+if nargin == 2 || (nargin == 3 && isempty(dzdy))
+
+    Y = 1 / 2 * sum(subsref((X - c) .^ 2, substruct('()', {':'}))); % Y is divided by d(4) in cnn_train.m / cnn_train_mgpu.m.
+%     Y = 1 / (2 * prod(d(1 : 3))) * sum(subsref((X - c) .^ 2, substruct('()', {':'}))); % Should Y be divided by prod(d(1 : 3))? It depends on the learning rate.
+
+elseif nargin == 3 && ~isempty(dzdy)
+
+    assert(numel(dzdy) == 1);
+
+    Y = dzdy * (X - c); % Y is divided by d(4) in cnn_train.m / cnn_train_mgpu.m.
+%     Y = dzdy / prod(d(1 : 3)) * (X - c); % Should Y be divided by prod(d(1 : 3))? It depends on the learning rate.
+
+end
+
+end
+
diff --git a/examples/mnistAutoencoder/sigmoid.m b/examples/mnistAutoencoder/sigmoid.m
new file mode 100644
index 00000000..9d2bad38
--- /dev/null
+++ b/examples/mnistAutoencoder/sigmoid.m
@@ -0,0 +1,16 @@
+function y = sigmoid(x, dzdy)
+%SIGMOID Logistic sigmoid and its derivative
+%   Forward: y = 1./(1 + exp(-x)). Backward (DZDY given): y = dzdy .* y .* (1 - y).
+
+y = 1 ./ (1 + exp(-x));
+
+if nargin == 2 && ~isempty(dzdy)
+
+    assert(all(size(x) == size(dzdy)));
+
+    y = dzdy .* y .* (1 - y);
+
+end
+
+end
+
diff --git a/examples/mnistAutoencoder/sigmoidcrossentropyloss.m b/examples/mnistAutoencoder/sigmoidcrossentropyloss.m
new file mode 100644
index 00000000..d09ffbb6
--- /dev/null
+++ b/examples/mnistAutoencoder/sigmoidcrossentropyloss.m
@@ -0,0 +1,29 @@
+function Y = sigmoidcrossentropyloss(X, c, dzdy)
+%SIGMOIDCROSSENTROPYLOSS Cross-entropy loss on sigmoid-transformed inputs
+%   Both the predictions X and the targets c are passed through a sigmoid.
+
+assert(numel(X) == numel(c));
+
+d = size(X);
+
+assert(all(d == size(c)));
+
+p = sigmoid(c);
+p_hat = sigmoid(X);
+
+if nargin == 2 || isempty(dzdy)
+
+    Y = -sum(subsref(p .* log(p_hat) + (1 - p) .* log(1 - p_hat), substruct('()', {':'}))); % Y is divided by d(4) in cnn_train.m / cnn_train_mgpu.m.
+%     Y = -1 / prod(d(1 : 3)) * sum(subsref(p .* log(p_hat) + (1 - p) .* log(1 - p_hat), substruct('()', {':'}))); % Should Y be divided by prod(d(1 : 3))? It depends on the learning rate.
+
+elseif nargin == 3 && ~isempty(dzdy)
+
+    assert(numel(dzdy) == 1);
+
+    Y = dzdy * (p_hat - p); % Y is divided by d(4) in cnn_train.m / cnn_train_mgpu.m.
+%     Y = dzdy / prod(d(1 : 3)) * (p_hat - p); % Should Y be divided by prod(d(1 : 3))? It depends on the learning rate.
+
+end
+
+end
+
diff --git a/examples/mnistAutoencoder/vl_simplenn.m b/examples/mnistAutoencoder/vl_simplenn.m
new file mode 100644
index 00000000..5a3c18cd
--- /dev/null
+++ b/examples/mnistAutoencoder/vl_simplenn.m
@@ -0,0 +1,249 @@
+function res = vl_simplenn(net, x, dzdy, res, varargin)
+% VL_SIMPLENN Evaluates a simple CNN
+% RES = VL_SIMPLENN(NET, X) evaluates the convnet NET on data X.
+% RES = VL_SIMPLENN(NET, X, DZDY) evaluates the convnet NET and its
+% derivative on data X and output derivative DZDY.
+%
+% The network has a simple (linear) topology, i.e. 
the computational
+% blocks are arranged in a sequence of layers. Please note that
+% there is no need to use this wrapper, which is provided for
+% convenience. Instead, the individual CNN computational blocks can
+% be evaluated directly, making it possible to create significantly
+% more complex topologies, and in general allowing greater
+% flexibility.
+%
+% The NET structure contains two fields:
+%
+% - net.layers: the CNN layers.
+% - net.normalization: information on how to normalize input data.
+%
+% The network expects the data X to be already normalized. This
+% usually involves rescaling the input image(s) and subtracting a
+% mean.
+%
+% RES is a structure array with one element per network layer plus
+% one representing the input. So RES(1) refers to the zeroth-layer
+% (input), RES(2) refers to the first layer, etc. Each entry has
+% fields:
+%
+% - res(i+1).x: the output of layer i. Hence res(1).x is the network
+%   input.
+%
+% - res(i+1).aux: auxiliary output data of layer i. For example,
+%   dropout uses this field to store the dropout mask.
+%
+% - res(i+1).dzdx: the derivative of the network output relative to
+%   variable res(i+1).x, i.e. the output of layer i. In particular
+%   res(1).dzdx is the derivative of the network output with respect
+%   to the network input.
+%
+% - res(i+1).dzdw: the derivative of the network output relative to
+%   the parameters of layer i. It can be a cell array for multiple
+%   parameters.
+%
+% net.layers is a cell array of network layers. The following
+% layers, encapsulating corresponding functions in the toolbox, are
+% supported:
+%
+% Convolutional layer::
+%   The convolutional layer wraps VL_NNCONV(). It has fields:
+%
+%   - layer.type = 'conv'
+%   - layer.filters: the filters.
+%   - layer.biases: the biases.
+%   - layer.stride: the sampling stride (usually 1).
+%   - layer.pad: the padding (usually 0).
+%
+% Max pooling layer::
+%   The max pooling layer wraps VL_NNPOOL(). It has fields:
+%
+%   - layer.type = 'pool'
+%   - layer.method: pooling method ('max' or 'avg').
+%   - layer.pool: the pooling size.
+%   - layer.stride: the sampling stride (usually 1).
+%   - layer.pad: the padding (usually 0).
+%
+% Normalization layer::
+%   The normalization layer wraps VL_NNNORMALIZE(). It has fields
+%
+%   - layer.type = 'normalize'
+%   - layer.param: the normalization parameters.
+%
+% ReLU layer::
+%   The ReLU layer wraps VL_NNRELU(). It has fields:
+%
+%   - layer.type = 'relu'
+%
+% Dropout layer::
+%   The dropout layer wraps VL_NNDROPOUT(). It has fields:
+%
+%   - layer.type = 'dropout'
+%   - layer.rate: the dropout rate.
+%
+% Softmax layer::
+%   The softmax layer wraps VL_NNSOFTMAX(). It has fields
+%
+%   - layer.type = 'softmax'
+%
+% Log-loss layer::
+%   The log-loss layer wraps VL_NNLOSS(). It has fields:
+%
+%   - layer.type = 'loss'
+%   - layer.class: the ground-truth class.
+%
+% Softmax-log-loss layer::
+%   The softmax-log-loss layer wraps VL_NNSOFTMAXLOSS(). It has
+%   fields:
+%
+%   - layer.type = 'softmaxloss'
+%   - layer.class: the ground-truth class.
+%
+% Custom layer::
+%   This can be used to specify custom layers.
+%
+%   - layer.type = 'custom'
+%   - layer.forward: a function handle computing the block.
+%   - layer.backward: a function handle computing the block derivative.
+%
+% The first function is called as res(i+1) = forward(layer, res(i), res(i+1))
+% where res() is the struct array specified before. The second function is
+% called as res(i) = backward(layer, res(i), res(i+1)). 
Note that the
+% `layer` structure can contain additional fields if needed.
+
+
+% Copyright (C) 2014 Andrea Vedaldi.
+% All rights reserved.
+%
+% This file is part of the VLFeat library and is made available under
+% the terms of the BSD license (see the COPYING file).
+
+opts.res = [] ;
+opts.conserveMemory = false ;
+opts.sync = false ;
+opts.disableDropout = false ;
+opts.freezeDropout = false ;
+opts = vl_argparse(opts, varargin);
+
+n = numel(net.layers) ;
+
+if (nargin <= 2) || isempty(dzdy)
+  doder = false ;
+else
+  doder = true ;
+end
+
+gpuMode = isa(x, 'gpuArray') ;
+
+if nargin <= 3 || isempty(res)
+  res = struct(...
+    'x', cell(1,n+1), ...
+    'dzdx', cell(1,n+1), ...
+    'dzdw', cell(1,n+1), ...
+    'aux', cell(1,n+1), ...
+    'time', num2cell(zeros(1,n+1)), ...
+    'backwardTime', num2cell(zeros(1,n+1))) ;
+end
+res(1).x = x ;
+
+for i=1:n
+  l = net.layers{i} ;
+  res(i).time = tic ;
+  switch l.type
+    case 'conv'
+      res(i+1).x = vl_nnconv(res(i).x, l.filters, l.biases, 'pad', l.pad, 'stride', l.stride) ;
+    case 'pool'
+      res(i+1).x = vl_nnpool(res(i).x, l.pool, 'pad', l.pad, 'stride', l.stride, 'method', l.method) ;
+    case 'normalize'
+      res(i+1).x = vl_nnnormalize(res(i).x, l.param) ;
+    case 'softmax'
+      res(i+1).x = vl_nnsoftmax(res(i).x) ;
+    case 'loss'
+      res(i+1).x = vl_nnloss(res(i).x, l.class) ;
+    case 'softmaxloss'
+      res(i+1).x = vl_nnsoftmaxloss(res(i).x, l.class) ;
+    case 'relu'
+      res(i+1).x = vl_nnrelu(res(i).x) ;
+    case 'noffset'
+      res(i+1).x = vl_nnnoffset(res(i).x, l.param) ;
+    case 'dropout'
+      if opts.disableDropout
+        res(i+1).x = res(i).x ;
+      elseif opts.freezeDropout
+        [res(i+1).x, res(i+1).aux] = vl_nndropout(res(i).x, 'rate', l.rate, 'mask', res(i+1).aux) ;
+      else
+        [res(i+1).x, res(i+1).aux] = vl_nndropout(res(i).x, 'rate', l.rate) ;
+      end
+    case 'custom'
+      res(i+1) = l.forward(l, res(i), res(i+1)) ;
+    case 'sigmoid'
+      res(i+1).x = sigmoid(res(i).x);
+    case 'sigmoidcrossentropyloss'
+      res(i+1).x = sigmoidcrossentropyloss(res(i).x, l.class);
+    case 'euclideanloss'
+      res(i+1).x = euclideanloss(res(i).x, l.class);
+    otherwise
+      error('Unknown layer type %s', l.type) ;
+  end
+  if opts.conserveMemory & ~doder & i < numel(net.layers) - 1
+    % TODO: forget unnecessary intermediate computations even when
+    % derivatives are required
+    res(i).x = [] ;
+  end
+  if gpuMode & opts.sync
+    % This should make things slower, but on MATLAB 2014a it is necessary
+    % for any decent performance.
+    wait(gpuDevice) ;
+  end
+  res(i).time = toc(res(i).time) ;
+end
+
+if doder
+  res(n+1).dzdx = dzdy ;
+  for i=n:-1:1
+    l = net.layers{i} ;
+    res(i).backwardTime = tic ;
+    switch l.type
+      case 'conv'
+        [res(i).dzdx, res(i).dzdw{1}, res(i).dzdw{2}] = ...
+          vl_nnconv(res(i).x, l.filters, l.biases, ...
+                    res(i+1).dzdx, ...
+                    'pad', l.pad, 'stride', l.stride) ;
+      case 'pool'
+        res(i).dzdx = vl_nnpool(res(i).x, l.pool, res(i+1).dzdx, ...
+ 'pad', l.pad, 'stride', l.stride, 'method', l.method) ; + case 'normalize' + res(i).dzdx = vl_nnnormalize(res(i).x, l.param, res(i+1).dzdx) ; + case 'softmax' + res(i).dzdx = vl_nnsoftmax(res(i).x, res(i+1).dzdx) ; + case 'loss' + res(i).dzdx = vl_nnloss(res(i).x, l.class, res(i+1).dzdx) ; + case 'softmaxloss' + res(i).dzdx = vl_nnsoftmaxloss(res(i).x, l.class, res(i+1).dzdx) ; + case 'relu' + res(i).dzdx = vl_nnrelu(res(i).x, res(i+1).dzdx) ; + case 'noffset' + res(i).dzdx = vl_nnnoffset(res(i).x, l.param, res(i+1).dzdx) ; + case 'dropout' + if opts.disableDropout + res(i).dzdx = res(i+1).dzdx ; + else + res(i).dzdx = vl_nndropout(res(i).x, res(i+1).dzdx, 'mask', res(i+1).aux) ; + end + case 'custom' + res(i) = l.backward(l, res(i), res(i+1)) ; + case 'sigmoid' + res(i).dzdx = sigmoid(res(i).x, res(i+1).dzdx); + case 'sigmoidcrossentropyloss' + res(i).dzdx = sigmoidcrossentropyloss(res(i).x, l.class, res(i+1).dzdx); + case 'euclideanloss' + res(i).dzdx = euclideanloss(res(i).x, l.class, res(i+1).dzdx); + end + if opts.conserveMemory + res(i+1).dzdx = [] ; + end + if gpuMode & opts.sync + wait(gpuDevice) ; + end + res(i).backwardTime = toc(res(i).backwardTime) ; + end +end
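

A minimal usage sketch (not part of the patch itself): extracting the learned 30-dimensional codes by truncating the trained network at the 'code' layer, i.e. layers 1 through 7. It assumes the net.mat and imdb.mat files produced by cnn_mnist_autoencoder_training_demo.m; the choice of 100 digits and the variable names are illustrative only.

    load('net.mat');                           % net and opts, as saved by the training demo
    load(opts.imdbPath);                       % imdb
    encoder = net;
    encoder.layers = net.layers(1 : 7);        % encoder_1 through the 'code' layer
    im = imdb.images.data(:, :, :, 1 : 100);   % 1 x 1 x 784 x 100 batch of digits
    if opts.useGpu
        im = gpuArray(im);
    end
    res = vl_simplenn(encoder, im);
    codes = squeeze(gather(res(end).x));       % 30 x 100 array of codes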
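
A quick finite-difference sketch checking the derivative convention used by euclideanloss (sigmoidcrossentropyloss can be probed the same way); the probe index and step size below are arbitrary choices, not values from the patch.

    rng(0);
    X = randn(1, 1, 784, 2, 'single');
    c = rand(1, 1, 784, 2, 'single');
    dzdx = euclideanloss(X, c, single(1));       % analytic derivative, X - c
    i = 123; epsilon = 1e-2;
    dX = zeros(size(X), 'single');
    dX(1, 1, i, 1) = epsilon;
    numeric = (euclideanloss(X + dX, c) - euclideanloss(X - dX, c)) / (2 * epsilon);
    fprintf('analytic %.4f, numeric %.4f\n', dzdx(1, 1, i, 1), numeric);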
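
The gradient step in cnn_train_adagrad.m reduces to the scalar AdaGrad recursion below; this toy loop (all values illustrative) shows the effective step size shrinking as squared gradients accumulate in G.

    G = 0; w = 5; lr = 1e-2; delta = 1e-8;       % delta plays the role of opts.delta
    for t = 1 : 1000
        g = 2 * w;                               % gradient of the toy objective w^2
        G = G + g ^ 2;                           % accumulate squared gradients
        w = w - lr / (delta + sqrt(G)) * g;      % same form as the filters/biases update
    end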