diff --git a/.gitmodules b/.gitmodules index 1058fd4..58587e3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,3 @@ -[submodule "3rdparty/src/gflags"] - path = 3rdparty/src/gflags - url = https://github.com/gflags/gflags.git -[submodule "3rdparty/src/glog"] - path = 3rdparty/src/glog - url = https://github.com/google/glog.git [submodule "3rdparty/src/protobuf"] path = 3rdparty/src/protobuf url = https://github.com/google/protobuf.git diff --git a/3rdparty/include/.gitignore b/3rdparty/include/.gitignore index 5cb3ece..42d3e6b 100644 --- a/3rdparty/include/.gitignore +++ b/3rdparty/include/.gitignore @@ -1,3 +1,2 @@ -gflags/ glog/ google/ diff --git a/3rdparty/include/getopt.h b/3rdparty/include/getopt.h deleted file mode 100644 index f3696d9..0000000 --- a/3rdparty/include/getopt.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Declarations for getopt. - Copyright (C) 1989, 90, 91, 92, 93, 94 Free Software Foundation, Inc. - -This file is part of the GNU C Library. Its master source is NOT part of -the C library, however. The master source lives in /gd/gnu/lib. - -The GNU C Library is free software; you can redistribute it and/or -modify it under the terms of the GNU Library General Public License as -published by the Free Software Foundation; either version 2 of the -License, or (at your option) any later version. - -The GNU C Library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Library General Public License for more details. - -You should have received a copy of the GNU Library General Public -License along with the GNU C Library; see the file COPYING.LIB. If -not, write to the Free Software Foundation, Inc., 675 Mass Ave, -Cambridge, MA 02139, USA. */ - -#ifndef _GETOPT_H -#define _GETOPT_H 1 - -#ifdef __cplusplus -extern "C" { -#endif - -/* For communication from `getopt' to the caller. - When `getopt' finds an option that takes an argument, - the argument value is returned here. - Also, when `ordering' is RETURN_IN_ORDER, - each non-option ARGV-element is returned here. */ - -extern char *optarg; - -/* Index in ARGV of the next element to be scanned. - This is used for communication to and from the caller - and for communication between successive calls to `getopt'. - - On entry to `getopt', zero means this is the first call; initialize. - - When `getopt' returns EOF, this is the index of the first of the - non-option elements that the caller should itself scan. - - Otherwise, `optind' communicates from one call to the next - how much of ARGV has been scanned so far. */ - -extern int optind; - -/* Callers store zero here to inhibit the error message `getopt' prints - for unrecognized options. */ - -extern int opterr; - -/* Set to an option character which was unrecognized. */ - -extern int optopt; - -/* Describe the long-named options requested by the application. - The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector - of `struct option' terminated by an element containing a name which is - zero. - - The field `has_arg' is: - no_argument (or 0) if the option does not take an argument, - required_argument (or 1) if the option requires an argument, - optional_argument (or 2) if the option takes an optional argument. - - If the field `flag' is not NULL, it points to a variable that is set - to the value given in the field `val' when the option is found, but - left unchanged if the option is not found. 
- - To have a long-named option do something other than set an `int' to - a compiled-in constant, such as set a value from `optarg', set the - option's `flag' field to zero and its `val' field to a nonzero - value (the equivalent single-letter option character, if there is - one). For long options that have a zero `flag' field, `getopt' - returns the contents of the `val' field. */ - -struct option -{ -#if defined (__STDC__) && __STDC__ - const char *name; -#else - char *name; -#endif - /* has_arg can't be an enum because some compilers complain about - type mismatches in all the code that assumes it is an int. */ - int has_arg; - int *flag; - int val; -}; - -/* Names for the values of the `has_arg' field of `struct option'. */ - -#define no_argument 0 -#define required_argument 1 -#define optional_argument 2 - -#if defined (__STDC__) && __STDC__ -#ifdef __GNU_LIBRARY__ -/* Many other libraries have conflicting prototypes for getopt, with - differences in the consts, in stdlib.h. To avoid compilation - errors, only prototype getopt for the GNU C library. */ -extern int getopt (int argc, char *const *argv, const char *shortopts); -#else /* not __GNU_LIBRARY__ */ -extern int getopt (); -#endif /* __GNU_LIBRARY__ */ -extern int getopt_long (int argc, char *const *argv, const char *shortopts, - const struct option *longopts, int *longind); -extern int getopt_long_only (int argc, char *const *argv, - const char *shortopts, - const struct option *longopts, int *longind); - -/* Internal only. Users should not call this directly. */ -extern int _getopt_internal (int argc, char *const *argv, - const char *shortopts, - const struct option *longopts, int *longind, - int long_only); -#else /* not __STDC__ */ -extern int getopt (); -extern int getopt_long (); -extern int getopt_long_only (); - -extern int _getopt_internal (); -#endif /* __STDC__ */ - -#ifdef __cplusplus -} -#endif - -#endif /* _GETOPT_H */ diff --git a/3rdparty/include/mkstemp.h b/3rdparty/include/mkstemp.h deleted file mode 100644 index a131cb1..0000000 --- a/3rdparty/include/mkstemp.h +++ /dev/null @@ -1,18 +0,0 @@ -/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright - (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. */ - -#pragma once - -static const char letters[] = -"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - -/* Generate a temporary file name based on TMPL. TMPL must match the - rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed - does not exist at the time of the call to mkstemp. TMPL is - overwritten with the result. */ -int mkstemp(char *tmpl); \ No newline at end of file diff --git a/3rdparty/include/unistd.h b/3rdparty/include/unistd.h deleted file mode 100644 index eabf24f..0000000 --- a/3rdparty/include/unistd.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef _UNISTD_H -#define _UNISTD_H 1 - -/* This file intended to serve as a drop-in replacement for -* unistd.h on Windows -* Please add functionality as needed -*/ - -#include -#include -#include /* getopt at: https://gist.github.com/ashelly/7776712 */ -#include /* for getpid() and the exec..() family */ -#include /* for _getcwd() and _chdir() */ - -#define srandom srand -#define random rand - -/* Values for the second argument to access. 
-These may be OR'd together. */ -#define R_OK 4 /* Test for read permission. */ -#define W_OK 2 /* Test for write permission. */ -//#define X_OK 1 /* execute permission - unsupported in windows*/ -#define F_OK 0 /* Test for existence. */ - -#define access _access -#define dup2 _dup2 -#define execve _execve -#define ftruncate _chsize -#define unlink _unlink -#define fileno _fileno -#define getcwd _getcwd -#define chdir _chdir -#define isatty _isatty -#define lseek _lseek -/* read, write, and close are NOT being #defined here, because while there are file handle specific versions for Windows, they probably don't work for sockets. You need to look at your app and consider whether to call e.g. closesocket(). */ - -#define ssize_t int - -#define STDIN_FILENO 0 -#define STDOUT_FILENO 1 -#define STDERR_FILENO 2 -/* should be in some equivalent to */ -//typedef __int8 int8_t; -//typedef __int16 int16_t; -//typedef __int32 int32_t; -//typedef __int64 int64_t; -//typedef unsigned __int8 uint8_t; -//typedef unsigned __int16 uint16_t; -//typedef unsigned __int32 uint32_t; -//typedef unsigned __int64 uint64_t; - -#endif /* unistd.h */ \ No newline at end of file diff --git a/3rdparty/src/gflags b/3rdparty/src/gflags deleted file mode 160000 index f0523f1..0000000 --- a/3rdparty/src/gflags +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f0523f14a93cbb46fff9b318508aa1c6923158c7 diff --git a/3rdparty/src/glog b/3rdparty/src/glog deleted file mode 160000 index 2a02db7..0000000 --- a/3rdparty/src/glog +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2a02db7aa5f3a832f0d215aeb395d5c218ad8f3e diff --git a/README.md b/README.md index 780462b..22e813d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,12 @@ -mini-caffe +Mini-Caffe ========== Minimal runtime core of Caffe porting to WIN32. This repo is aimed to provide a minimal runtime of Caffe for those want to run Caffe model on Windows platform. +### Update + +- 2016/12/11. Mini-Caffe now only depends on OpenBLAS and protobuf. + ### What can mini-caffe do? This repo has no CUDA, no Caffe tools which means you can only use mini-caffe to run the nerual network model in CPU mode. You should train the nerual model use caffe tools on *nix platform, mini-caffe is just an optional choice for testing the nerual model on Windows platform. If you want a fully ported Caffe, you may refer to [happynear/caffe-windows](https://github.com/happynear/caffe-windows). @@ -21,18 +25,18 @@ Since Caffe depends on many 3rdparty libraries, I have modified some code to rem but we still need libraries below. -- OpenCV -- Boost -- gflags -- glog +- ~~OpenCV~~ +- ~~Boost~~ +- ~~gflags~~ +- ~~glog~~ - protobuf - openblas -We can download pre-compiled OpenCV and Boost, and set two environment variables `OpenCV_DIR` and `Boost_DIR`. For example, `OpenCV_DIR` = `D:\3rdparty\opencv2.4.8\build` and `Boost_DIR` = `D:\3rdparty\boost_1_57_0`. Pay attention to the Compiler version and build Architecture, which will be **VC12** and **x86**. +~~We can download pre-compiled OpenCV and Boost, and set two environment variables `OpenCV_DIR` and `Boost_DIR`. For example, `OpenCV_DIR` = `D:\3rdparty\opencv2.4.8\build` and `Boost_DIR` = `D:\3rdparty\boost_1_57_0`. Pay attention to the Compiler version and build Architecture, which will be **VC12** and **x86**.~~ For openblas, I already put the library in the source code. -gflags, glog, protobuf can be compiled by ourself, I add these libraries as submodules of mini-caffe. However, I also provide a pre-compiled version of these libraries. 
The binaries is compiled by VC12 for x86. You can download from [dropbox](https://www.dropbox.com/s/8zbimuiviiyede5/3rdparty-VC12-x86.zip?dl=0) or [baidu driver](http://pan.baidu.com/s/1hqOoCL2). +~~gflags, glog,~~ protobuf can be compiled by ourself, I add these libraries as submodules of mini-caffe. However, I also provide a pre-compiled version of these libraries. The binaries is compiled by VC12 for x86. You can download from [dropbox](https://www.dropbox.com/s/8zbimuiviiyede5/3rdparty-VC12-x86.zip?dl=0) or [baidu driver](http://pan.baidu.com/s/1hqOoCL2). To compile these libraries yourself, you should download the source code first. @@ -42,21 +46,6 @@ git submodule update --init all source code are under `3rdparty/src`. -##### gflags - -``` -cd 3rdparty/src/gflags -mkdir build -cd build -cmake .. -``` - -use VS2013 to compile gflags. `Debug` and `Release`. - -##### glog - -glog project already provides a solution file for Visual Studio. Just compile `Debug` and `Release`. - ##### protobuf ``` diff --git a/README.zh.md b/README.zh.md deleted file mode 100644 index b995f18..0000000 --- a/README.zh.md +++ /dev/null @@ -1,97 +0,0 @@ -mini-caffe -========== - -mini-caffe 为 Windows 提供一个 Caffe 的最小运行环境。请使用 **VS2013** 编译项目,**VS2012**及以下版本不保证依赖库能够正常编译成功。亲测 VS2012 无法编译 cmake 生成的 protobuf 库。 - -### 安装 - -首先利用 git 克隆该仓库 - -``` -git clone https://github.com/luoyetx/mini-caffe.git -cd mini-caffe -git submodule update --init --recursive -``` - -设置环境变量 `OpenCV_DIR` 指向 OpenCV 安装目录,比如 `D:\3rdparty\opencv2.4.8\build`。设置 `BOOST_DIR` 指向 Boost 安装目录,比如 `D:\3rdparty\boost_1_57_0`,注意如果你下载的是事先编译好的 Boost 库,请把库目录(包含有lib文件的目录改成`stage\lib`),如果你是自己源码编译的,那就不用管了。注意 Boost 库使用 32 位的。 - -### 编译依赖库 - -我裁剪了 Caffe 的源码,将大部分数据层的代码都删除了,只有 MemoryDataLayer 了,这样做可以极大地减少第三方库的依赖和编译。同时裁剪过的库只使用 CPU 模式,网络的数据层只使用内存数据。经过裁剪之后的 Caffe 只依赖如下这些库。 - -* OpenCV -* Boost -* gflags -* glog(部分功能没有 Windows 的实现,暴力地将 Caffe 中用到的代码注释掉了) -* protobuf -* OpenBLAS - -OpenCV 和 Boost 我们一般使用事先编译好的,直接在 CMakeLists.txt 中使用,而 OpenBLAS 的库我已经直接添加在了项目的源码树中,是事先编译好的 32 库,下载连接在[这里](http://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win32.zip/download),可以视情况自行下载更换。下面介绍如何源码编译另外三个库。 - -##### gflags - -gflags 使用了 CMake 构建工程,注意一下 CMake 的版本,如果版本过低,请去[CMake官网](http://www.cmake.org/download/)下载最新版。 - -``` -cd 3rdparty/src/gflags -mkdir build -cd build -cmake .. -``` - -在利用 CMake 生成 VS 工程文件之后,直接打开工程文件编译 `Debug` 和 `Release` 两个版本。 - -##### glog - -glog 提供了 VS 的工程文件,我们直接打开工程文件 `google-glog.sln`,因为 VS 版本的关系,可能要升级工程文件,这些都是自动的,点确定后就不必在意了。打开工程之后直接编译 `Debug` 和 `Release` 两个版本。 - -##### protobuf - -protobuf 也使用 CMake 构建工程,注意以下 cmake 时的参数 - -``` -cd 3rdparty/src/protobuf/cmake -mkdir build -cd build -cmake .. -DBUILD_TESTING=OFF -``` - -和之前一样,直接打开工程文件后编译 `Debug` 和 `Release` 两个版本。 - -### 编译 Caffe 前的最后准备 - -我们需要收集各个库的头文件,库文件和 dll 文件,我写了个 `copydeps.bat` 脚本,直接双击运行,会将所有需要的文件复制到 3rdparty 指定的目录下。 - -我们还需要使用 protobuf 预处理 Caffe 中的 `caffe.proto` 文件,用来生成头文件和源文件,我写了 `generatebp.bat` 脚本自动调用 `srdparty\bin\protoc.exe` 生成头文件和源文件,并将其放到指定的目录下,直接双击运行就可以了。 - -做完上述准备后,我们可以 cmake 来生成 Caffe 的 VS 工程文件,在源码树根目录下创建 build 目录 - -``` -mkdir build -cd build -cmake .. 
-``` - -### 编译 Caffe - -打开生成的 VS 工程文件就可以编译 Caffe 代码了,我配置了 CMakeLists.txt 生成 Caffe 静态库。 - -### 将 mini-caffe 作为工程的一部分 - -现在因为一些原因,还不能直接将上述编译生成的 `libcaffe.lib` 通过静态链接的方式加入到其他项目中,需要将 mini-caffe 的源码作为项目的一部分参与编译,我写了 `mini-caffe.cmake` 文件可以方便的将 mini-caffe 整个项目的源码作为其他项目源码的一部分,只要在相应的 CMakeLists.txt 包含这个文件即可。如下面的例子。 - -``` -+ example -|__CMakeLists.txt -|__mini-caffe -| |__*** -| |__mini-caffe.cmake -| |__*** -| -|__example.cpp -|__example.hpp -``` - -在 example 项目的 CMakeLists.txt 中加入 `include(mini-caffe/mini-caffe.cmake)` 就可以将 mini-caffe 作为项目的一部分参与编译。 - -具体项目结构可以参考[mini-caffe-example](https://github.com/luoyetx/mini-caffe-example)的项目配置。 diff --git a/copydeps.bat b/copydeps.bat index 5d3b5af..3ed456a 100644 --- a/copydeps.bat +++ b/copydeps.bat @@ -1,10 +1,3 @@ -mkdir 3rdparty\include\gflags -copy 3rdparty\src\gflags\build\include\gflags 3rdparty\include\gflags -copy 3rdparty\src\gflags\build\lib\Debug\gflags.lib 3rdparty\lib\gflagsd.lib -copy 3rdparty\src\gflags\build\lib\Debug\gflags_nothreads.lib 3rdparty\lib\gflags_nothreadsd.lib -copy 3rdparty\src\gflags\build\lib\Release\gflags.lib 3rdparty\lib\gflags.lib -copy 3rdparty\src\gflags\build\lib\Release\gflags_nothreads.lib 3rdparty\lib\gflags_nothreads.lib - mkdir 3rdparty\include\glog copy 3rdparty\src\glog\src\windows\glog 3rdparty\include\glog copy 3rdparty\src\glog\Debug\libglog.lib 3rdparty\lib\libglogd.lib diff --git a/include/caffe/caffe.hpp b/include/caffe/caffe.hpp index 0688209..935344e 100644 --- a/include/caffe/caffe.hpp +++ b/include/caffe/caffe.hpp @@ -10,11 +10,7 @@ #include "caffe/layer.hpp" #include "caffe/layer_factory.hpp" #include "caffe/net.hpp" -#include "caffe/parallel.hpp" #include "caffe/proto/caffe.pb.h" -#include "caffe/solver.hpp" -#include "caffe/solver_factory.hpp" -#include "caffe/util/benchmark.hpp" #include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 3c6a076..4e96417 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -1,10 +1,6 @@ #ifndef CAFFE_COMMON_HPP_ #define CAFFE_COMMON_HPP_ -#include -#include -#include - #include #include #include // NOLINT(readability/streams) @@ -15,22 +11,16 @@ #include #include // pair #include +#include + +#include "caffe/util/logging.hpp" -#include "caffe/util/device_alternate.hpp" +#define NO_GPU LOG(FATAL) << "Cannot use GPU in CPU-only Caffe: check mode." // Convert macro to string #define STRINGIFY(m) #m #define AS_STRING(m) STRINGIFY(m) -// gflags 2.1 issue: namespace google was changed to gflags without warning. -// Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version -// 2.1. If yes, we will add a temporary solution to redirect the namespace. -// TODO(Yangqing): Once gflags solves the problem in a more elegant way, let's -// remove the following hack. -#ifndef GFLAGS_GFLAGS_H_ -namespace gflags = google; -#endif // GFLAGS_GFLAGS_H_ - // Disable the copy and assignment operator for a class. 
#define DISABLE_COPY_AND_ASSIGN(classname) \ private:\ @@ -43,42 +33,14 @@ private:\ template class classname; \ template class classname -#define INSTANTIATE_LAYER_GPU_FORWARD(classname) \ - template void classname::Forward_gpu( \ - const std::vector*>& bottom, \ - const std::vector*>& top); \ - template void classname::Forward_gpu( \ - const std::vector*>& bottom, \ - const std::vector*>& top); - -#define INSTANTIATE_LAYER_GPU_BACKWARD(classname) \ - template void classname::Backward_gpu( \ - const std::vector*>& top, \ - const std::vector& propagate_down, \ - const std::vector*>& bottom); \ - template void classname::Backward_gpu( \ - const std::vector*>& top, \ - const std::vector& propagate_down, \ - const std::vector*>& bottom) - -#define INSTANTIATE_LAYER_GPU_FUNCS(classname) \ - INSTANTIATE_LAYER_GPU_FORWARD(classname); \ - INSTANTIATE_LAYER_GPU_BACKWARD(classname) - // A simple macro to mark codes that are not implemented, so that when the code // is executed we will see a fatal log. #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet" -// See PR #1236 -namespace cv { class Mat; } - namespace caffe { -// We will use the boost shared_ptr instead of the new C++11 one mainly -// because cuda does not work (at least now) well with C++11 features. -using boost::shared_ptr; - // Common functions and classes from std that caffe often uses. +using std::shared_ptr; using std::fstream; using std::ios; using std::isnan; @@ -93,95 +55,6 @@ using std::string; using std::stringstream; using std::vector; -// A global initialization function that you should call in your main function. -// Currently it initializes google flags and google logging. -void GlobalInit(int* pargc, char*** pargv); - -// A singleton class to hold common caffe stuff, such as the handler that -// caffe is going to use for cublas, curand, etc. -class Caffe { - public: - ~Caffe(); - - // Thread local context for Caffe. Moved to common.cpp instead of - // including boost/thread.hpp to avoid a boost/NVCC issues (#1009, #1010) - // on OSX. Also fails on Linux with CUDA 7.0.18. - static Caffe& Get(); - - enum Brew { CPU, GPU }; - - // This random number generator facade hides boost and CUDA rng - // implementation from one another (for cross-platform compatibility). - class RNG { - public: - RNG(); - explicit RNG(unsigned int seed); - explicit RNG(const RNG&); - RNG& operator=(const RNG&); - void* generator(); - private: - class Generator; - shared_ptr generator_; - }; - - // Getters for boost rng, curand, and cublas handles - inline static RNG& rng_stream() { - if (!Get().random_generator_) { - Get().random_generator_.reset(new RNG()); - } - return *(Get().random_generator_); - } -#ifndef CPU_ONLY - inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } - inline static curandGenerator_t curand_generator() { - return Get().curand_generator_; - } -#endif - - // Returns the mode: running on CPU or GPU. - inline static Brew mode() { return Get().mode_; } - // The setters for the variables - // Sets the mode. It is recommended that you don't change the mode halfway - // into the program since that may cause allocation of pinned memory being - // freed in a non-pinned way, which may cause problems - I haven't verified - // it personally but better to note it here in the header file. - inline static void set_mode(Brew mode) { Get().mode_ = mode; } - // Sets the random seed of both boost and curand - static void set_random_seed(const unsigned int seed); - // Sets the device. 
Since we have cublas and curand stuff, set device also - // requires us to reset those values. - static void SetDevice(const int device_id); - // Prints the current GPU status. - static void DeviceQuery(); - // Check if specified device is available - static bool CheckDevice(const int device_id); - // Search from start_id to the highest possible device ordinal, - // return the ordinal of the first available device. - static int FindDevice(const int start_id = 0); - // Parallel training info - inline static int solver_count() { return Get().solver_count_; } - inline static void set_solver_count(int val) { Get().solver_count_ = val; } - inline static bool root_solver() { return Get().root_solver_; } - inline static void set_root_solver(bool val) { Get().root_solver_ = val; } - - protected: -#ifndef CPU_ONLY - cublasHandle_t cublas_handle_; - curandGenerator_t curand_generator_; -#endif - shared_ptr random_generator_; - - Brew mode_; - int solver_count_; - bool root_solver_; - - private: - // The private constructor to avoid duplicate instantiation. - Caffe(); - - DISABLE_COPY_AND_ASSIGN(Caffe); -}; - } // namespace caffe #endif // CAFFE_COMMON_HPP_ diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp deleted file mode 100644 index 8ed5542..0000000 --- a/include/caffe/data_reader.hpp +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef CAFFE_DATA_READER_HPP_ -#define CAFFE_DATA_READER_HPP_ - -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/util/blocking_queue.hpp" -#include "caffe/util/db.hpp" - -namespace caffe { - -/** - * @brief Reads data from a source to queues available to data layers. - * A single reading thread is created per source, even if multiple solvers - * are running in parallel, e.g. for multi-GPU training. This makes sure - * databases are read sequentially, and that each solver accesses a different - * subset of the database. Data is distributed to solvers in a round-robin - * way to keep parallel training deterministic. - */ -class DataReader { - public: - explicit DataReader(const LayerParameter& param); - ~DataReader(); - - inline BlockingQueue& free() const { - return queue_pair_->free_; - } - inline BlockingQueue& full() const { - return queue_pair_->full_; - } - - protected: - // Queue pairs are shared between a body and its readers - class QueuePair { - public: - explicit QueuePair(int size); - ~QueuePair(); - - BlockingQueue free_; - BlockingQueue full_; - - DISABLE_COPY_AND_ASSIGN(QueuePair); - }; - - // A single body is created per source - class Body : public InternalThread { - public: - explicit Body(const LayerParameter& param); - virtual ~Body(); - - protected: - void InternalThreadEntry(); - void read_one(db::Cursor* cursor, QueuePair* qp); - - const LayerParameter param_; - BlockingQueue > new_queue_pairs_; - - friend class DataReader; - - DISABLE_COPY_AND_ASSIGN(Body); - }; - - // A source is uniquely identified by its layer name + path, in case - // the same database is read from two different locations in the net. 
- static inline string source_key(const LayerParameter& param) { - return param.name() + ":" + param.data_param().source(); - } - - const shared_ptr queue_pair_; - shared_ptr body_; - - static map > bodies_; - -DISABLE_COPY_AND_ASSIGN(DataReader); -}; - -} // namespace caffe - -#endif // CAFFE_DATA_READER_HPP_ diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp deleted file mode 100644 index 97b4ee6..0000000 --- a/include/caffe/data_transformer.hpp +++ /dev/null @@ -1,154 +0,0 @@ -#ifndef CAFFE_DATA_TRANSFORMER_HPP -#define CAFFE_DATA_TRANSFORMER_HPP - -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -/** - * @brief Applies common transformations to the input data, such as - * scaling, mirroring, substracting the image mean... - */ -template -class DataTransformer { - public: - explicit DataTransformer(const TransformationParameter& param, Phase phase); - virtual ~DataTransformer() {} - - /** - * @brief Initialize the Random number generations if needed by the - * transformation. - */ - void InitRand(); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to the data. - * - * @param datum - * Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See data_layer.cpp for an example. - */ - void Transform(const Datum& datum, Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Datum. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. - */ - void Transform(const vector & datum_vector, - Blob* transformed_blob); - -#ifdef USE_OPENCV - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Mat. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. - */ - void Transform(const vector & mat_vector, - Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a cv::Mat - * - * @param cv_img - * cv::Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See image_data_layer.cpp for an example. - */ - void Transform(const cv::Mat& cv_img, Blob* transformed_blob); -#endif // USE_OPENCV - - /** - * @brief Applies the same transformation defined in the data layer's - * transform_param block to all the num images in a input_blob. - * - * @param input_blob - * A Blob containing the data to be transformed. It applies the same - * transformation to all the num images in the blob. - * @param transformed_blob - * This is destination blob, it will contain as many images as the - * input blob. It can be part of top blob's data. - */ - void Transform(Blob* input_blob, Blob* transformed_blob); - - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. 
- * - * @param datum - * Datum containing the data to be transformed. - */ - vector InferBlobShape(const Datum& datum); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. - */ - vector InferBlobShape(const vector & datum_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - */ -#ifdef USE_OPENCV - vector InferBlobShape(const vector & mat_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * - * @param cv_img - * cv::Mat containing the data to be transformed. - */ - vector InferBlobShape(const cv::Mat& cv_img); -#endif // USE_OPENCV - - protected: - /** - * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). - * - * @param n - * The upperbound (exclusive) value of the random number. - * @return - * A uniformly random integer value from ({0, 1, ..., n-1}). - */ - virtual int Rand(int n); - - void Transform(const Datum& datum, Dtype* transformed_data); - // Tranformation parameters - TransformationParameter param_; - - - shared_ptr rng_; - Phase phase_; - Blob data_mean_; - vector mean_values_; -}; - -} // namespace caffe - -#endif // CAFFE_DATA_TRANSFORMER_HPP_ diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp deleted file mode 100644 index 6a8c5a0..0000000 --- a/include/caffe/internal_thread.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef CAFFE_INTERNAL_THREAD_HPP_ -#define CAFFE_INTERNAL_THREAD_HPP_ - -#include "caffe/common.hpp" - -/** - Forward declare boost::thread instead of including boost/thread.hpp - to avoid a boost/NVCC issues (#1009, #1010) on OSX. - */ -namespace boost { class thread; } - -namespace caffe { - -/** - * Virtual class encapsulate boost::thread for use in base class - * The child class will acquire the ability to run a single thread, - * by reimplementing the virtual function InternalThreadEntry. - */ -class InternalThread { - public: - InternalThread() : thread_() {} - virtual ~InternalThread(); - - /** - * Caffe's thread local state will be initialized using the current - * thread values, e.g. device id, solver index etc. The random seed - * is initialized using caffe_rng_rand. - */ - void StartInternalThread(); - - /** Will not return until the internal thread has exited. */ - void StopInternalThread(); - - bool is_started() const; - - protected: - /* Implement this method in your subclass - with the code you want your thread to run. */ - virtual void InternalThreadEntry() {} - - /* Should be tested when running loops to exit when requested. 
*/ - bool must_stop(); - - private: - void entry(int device, Caffe::Brew mode, int rand_seed, int solver_count, - bool root_solver); - - shared_ptr thread_; -}; - -} // namespace caffe - -#endif // CAFFE_INTERNAL_THREAD_HPP_ diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 10f353f..e2abe61 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -11,12 +11,6 @@ #include "caffe/proto/caffe.pb.h" #include "caffe/util/math_functions.hpp" -/** - Forward declare boost::thread instead of including boost/thread.hpp - to avoid a boost/NVCC issues (#1009, #1010) on OSX. - */ -namespace boost { class mutex; } - namespace caffe { /** @@ -66,7 +60,6 @@ class Layer { */ void SetUp(const vector*>& bottom, const vector*>& top) { - InitMutex(); CheckBlobCounts(bottom, top); LayerSetUp(bottom, top); Reshape(bottom, top); @@ -334,15 +327,6 @@ class Layer { /** @brief Using the CPU device, compute the layer output. */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top) = 0; - /** - * @brief Using the GPU device, compute the layer output. - * Fall back to Forward_cpu() if unavailable. - */ - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // LOG(WARNING) << "Using CPU code as backup."; - return Forward_cpu(bottom, top); - } /** * @brief Using the CPU device, compute the gradients for any parameters and @@ -351,17 +335,6 @@ class Layer { virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) = 0; - /** - * @brief Using the GPU device, compute the gradients for any parameters and - * for the bottom blobs if propagate_down is true. - * Fall back to Backward_cpu() if unavailable. - */ - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - // LOG(WARNING) << "Using CPU code as backup."; - Backward_cpu(top, propagate_down, bottom); - } /** * Called by the parent Layer's SetUp to check that the number of bottom @@ -431,16 +404,6 @@ class Layer { /** Whether this layer is actually shared by other nets*/ bool is_shared_; - /** The mutex for sequential forward if this layer is shared */ - shared_ptr forward_mutex_; - - /** Initialize forward_mutex_ */ - void InitMutex(); - /** Lock forward_mutex_ if this layer is shared */ - void Lock(); - /** Unlock forward_mutex_ if this layer is shared */ - void Unlock(); - DISABLE_COPY_AND_ASSIGN(Layer); }; // class Layer @@ -450,39 +413,16 @@ class Layer { template inline Dtype Layer::Forward(const vector*>& bottom, const vector*>& top) { - // Lock during forward to ensure sequential forward - Lock(); Dtype loss = 0; Reshape(bottom, top); - switch (Caffe::mode()) { - case Caffe::CPU: - Forward_cpu(bottom, top); - for (int top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { continue; } - const int count = top[top_id]->count(); - const Dtype* data = top[top_id]->cpu_data(); - const Dtype* loss_weights = top[top_id]->cpu_diff(); - loss += caffe_cpu_dot(count, data, loss_weights); - } - break; - case Caffe::GPU: - Forward_gpu(bottom, top); -#ifndef CPU_ONLY - for (int top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { continue; } - const int count = top[top_id]->count(); - const Dtype* data = top[top_id]->gpu_data(); - const Dtype* loss_weights = top[top_id]->gpu_diff(); - Dtype blob_loss = 0; - caffe_gpu_dot(count, data, loss_weights, &blob_loss); - loss += blob_loss; - } -#endif - break; - default: - LOG(FATAL) << "Unknown caffe mode."; + Forward_cpu(bottom, 
top); + for (int top_id = 0; top_id < top.size(); ++top_id) { + if (!this->loss(top_id)) { continue; } + const int count = top[top_id]->count(); + const Dtype* data = top[top_id]->cpu_data(); + const Dtype* loss_weights = top[top_id]->cpu_diff(); + loss += caffe_cpu_dot(count, data, loss_weights); } - Unlock(); return loss; } @@ -490,16 +430,7 @@ template inline void Layer::Backward(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - switch (Caffe::mode()) { - case Caffe::CPU: - Backward_cpu(top, propagate_down, bottom); - break; - case Caffe::GPU: - Backward_gpu(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown caffe mode."; - } + Backward_cpu(top, propagate_down, bottom); } // Serialize LayerParameter to protocol buffer diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp index f385afc..acc5d91 100644 --- a/include/caffe/layer_factory.hpp +++ b/include/caffe/layer_factory.hpp @@ -73,9 +73,6 @@ class LayerRegistry { // Get a layer using a LayerParameter. static shared_ptr > CreateLayer(const LayerParameter& param) { - if (Caffe::root_solver()) { - LOG(INFO) << "Creating layer " << param.name(); - } const string& type = param.type(); CreatorRegistry& registry = Registry(); CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type diff --git a/include/caffe/layers/absval_layer.hpp b/include/caffe/layers/absval_layer.hpp index 9b5305d..beef217 100644 --- a/include/caffe/layers/absval_layer.hpp +++ b/include/caffe/layers/absval_layer.hpp @@ -37,8 +37,6 @@ class AbsValLayer : public NeuronLayer { /// @copydoc AbsValLayer virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the absolute value inputs. @@ -59,8 +57,6 @@ class AbsValLayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); }; } // namespace caffe diff --git a/include/caffe/layers/base_conv_layer.hpp b/include/caffe/layers/base_conv_layer.hpp index 0160a83..8b1154f 100644 --- a/include/caffe/layers/base_conv_layer.hpp +++ b/include/caffe/layers/base_conv_layer.hpp @@ -41,17 +41,6 @@ class BaseConvolutionLayer : public Layer { weights); void backward_cpu_bias(Dtype* bias, const Dtype* input); -#ifndef CPU_ONLY - void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_gpu_bias(Dtype* output, const Dtype* bias); - void backward_gpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* col_output); - void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* - weights); - void backward_gpu_bias(Dtype* bias, const Dtype* input); -#endif - /// @brief The spatial dimensions of the input. 
inline int input_shape(int i) { return (*bottom_shape_)[channel_axis_ + i]; @@ -123,38 +112,6 @@ class BaseConvolutionLayer : public Layer { pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data); } } -#ifndef CPU_ONLY - inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - im2col_gpu(data, conv_in_channels_, - conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff); - } else { - im2col_nd_gpu(data, num_spatial_axes_, num_kernels_im2col_, - conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(), - kernel_shape_.gpu_data(), pad_.gpu_data(), - stride_.gpu_data(), dilation_.gpu_data(), col_buff); - } - } - inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - col2im_gpu(col_buff, conv_in_channels_, - conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - dilation_.cpu_data()[0], dilation_.cpu_data()[1], data); - } else { - col2im_nd_gpu(col_buff, num_spatial_axes_, num_kernels_col2im_, - conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(), - kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(), - dilation_.gpu_data(), data); - } - } -#endif int num_kernels_im2col_; int num_kernels_col2im_; diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp deleted file mode 100644 index 2c49b73..0000000 --- a/include/caffe/layers/base_data_layer.hpp +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef CAFFE_DATA_LAYERS_HPP_ -#define CAFFE_DATA_LAYERS_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/data_transformer.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/blocking_queue.hpp" - -namespace caffe { - -/** - * @brief Provides base for data layers that feed blobs to the Net. - * - * TODO(dox): thorough documentation for Forward and proto params. - */ -template -class BaseDataLayer : public Layer { - public: - explicit BaseDataLayer(const LayerParameter& param); - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden except by the BasePrefetchingDataLayer. - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top) {} - // Data layers have no bottoms, so reshaping is trivial. 
- virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - protected: - TransformationParameter transform_param_; - shared_ptr > data_transformer_; - bool output_labels_; -}; - -template -class Batch { - public: - Blob data_, label_; -}; - -template -class BasePrefetchingDataLayer : - public BaseDataLayer, public InternalThread { - public: - explicit BasePrefetchingDataLayer(const LayerParameter& param); - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden. - void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - // Prefetches batches (asynchronously if to GPU memory) - static const int PREFETCH_COUNT = 3; - - protected: - virtual void InternalThreadEntry(); - virtual void load_batch(Batch* batch) = 0; - - Batch prefetch_[PREFETCH_COUNT]; - BlockingQueue*> prefetch_free_; - BlockingQueue*> prefetch_full_; - - Blob transformed_data_; -}; - -} // namespace caffe - -#endif // CAFFE_DATA_LAYERS_HPP_ diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp index 9b2d512..5cd82cb 100644 --- a/include/caffe/layers/batch_norm_layer.hpp +++ b/include/caffe/layers/batch_norm_layer.hpp @@ -56,12 +56,8 @@ class BatchNormLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); Blob mean_, variance_, temp_, x_norm_; bool use_global_stats_; diff --git a/include/caffe/layers/batch_reindex_layer.hpp b/include/caffe/layers/batch_reindex_layer.hpp index ebb3a56..e420e28 100644 --- a/include/caffe/layers/batch_reindex_layer.hpp +++ b/include/caffe/layers/batch_reindex_layer.hpp @@ -44,8 +44,6 @@ class BatchReindexLayer : public Layer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the reordered input. 
@@ -64,8 +62,6 @@ class BatchReindexLayer : public Layer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); private: struct pair_sort_first { diff --git a/include/caffe/layers/bias_layer.hpp b/include/caffe/layers/bias_layer.hpp index eedc3aa..d5ba0fe 100644 --- a/include/caffe/layers/bias_layer.hpp +++ b/include/caffe/layers/bias_layer.hpp @@ -35,12 +35,8 @@ class BiasLayer : public Layer { virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); private: Blob bias_multiplier_; diff --git a/include/caffe/layers/bnll_layer.hpp b/include/caffe/layers/bnll_layer.hpp index be07c74..55384bf 100644 --- a/include/caffe/layers/bnll_layer.hpp +++ b/include/caffe/layers/bnll_layer.hpp @@ -40,8 +40,6 @@ class BNLLLayer : public NeuronLayer { /// @copydoc BNLLLayer virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the BNLL inputs. @@ -61,8 +59,6 @@ class BNLLLayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); }; } // namespace caffe diff --git a/include/caffe/layers/concat_layer.hpp b/include/caffe/layers/concat_layer.hpp index a157024..6c405ae 100644 --- a/include/caffe/layers/concat_layer.hpp +++ b/include/caffe/layers/concat_layer.hpp @@ -46,8 +46,6 @@ class ConcatLayer : public Layer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the concatenate inputs. @@ -73,8 +71,6 @@ class ConcatLayer : public Layer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); int count_; int num_concats_; diff --git a/include/caffe/layers/contrastive_loss_layer.hpp b/include/caffe/layers/contrastive_loss_layer.hpp index e890afb..8c36170 100644 --- a/include/caffe/layers/contrastive_loss_layer.hpp +++ b/include/caffe/layers/contrastive_loss_layer.hpp @@ -57,8 +57,6 @@ class ContrastiveLossLayer : public LossLayer { /// @copydoc ContrastiveLossLayer virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the Contrastive error gradient w.r.t. the inputs. 
@@ -87,8 +85,6 @@ class ContrastiveLossLayer : public LossLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); Blob diff_; // cached for backward pass Blob dist_sq_; // cached for backward pass diff --git a/include/caffe/layers/conv_layer.hpp b/include/caffe/layers/conv_layer.hpp index 93a618d..eeb6faf 100644 --- a/include/caffe/layers/conv_layer.hpp +++ b/include/caffe/layers/conv_layer.hpp @@ -69,12 +69,8 @@ class ConvolutionLayer : public BaseConvolutionLayer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); virtual inline bool reverse_dimensions() { return false; } virtual void compute_output_shape(); }; diff --git a/include/caffe/layers/crop_layer.hpp b/include/caffe/layers/crop_layer.hpp index c4fda12..ad65b66 100644 --- a/include/caffe/layers/crop_layer.hpp +++ b/include/caffe/layers/crop_layer.hpp @@ -36,10 +36,6 @@ class CropLayer : public Layer { const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); vector offsets; diff --git a/include/caffe/layers/cudnn_conv_layer.hpp b/include/caffe/layers/cudnn_conv_layer.hpp deleted file mode 100644 index 31fe49a..0000000 --- a/include/caffe/layers/cudnn_conv_layer.hpp +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef CAFFE_CUDNN_CONV_LAYER_HPP_ -#define CAFFE_CUDNN_CONV_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/conv_layer.hpp" - -namespace caffe { - -#ifdef USE_CUDNN -/* - * @brief cuDNN implementation of ConvolutionLayer. - * Fallback to ConvolutionLayer for CPU mode. - * - * cuDNN accelerates convolution through forward kernels for filtering and bias - * plus backward kernels for the gradient w.r.t. the filters, biases, and - * inputs. Caffe + cuDNN further speeds up the computation through forward - * parallelism across groups and backward parallelism across gradients. - * - * The CUDNN engine does not have memory overhead for matrix buffers. For many - * input and filter regimes the CUDNN engine is faster than the CAFFE engine, - * but for fully-convolutional models and large inputs the CAFFE engine can be - * faster as long as it fits in memory. 
-*/ -template -class CuDNNConvolutionLayer : public ConvolutionLayer { - public: - explicit CuDNNConvolutionLayer(const LayerParameter& param) - : ConvolutionLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNConvolutionLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t* handle_; - cudaStream_t* stream_; - - // algorithms for forward and backwards convolutions - cudnnConvolutionFwdAlgo_t *fwd_algo_; - cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_; - cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_; - - vector bottom_descs_, top_descs_; - cudnnTensorDescriptor_t bias_desc_; - cudnnFilterDescriptor_t filter_desc_; - vector conv_descs_; - int bottom_offset_, top_offset_, bias_offset_; - - size_t *workspace_fwd_sizes_; - size_t *workspace_bwd_data_sizes_; - size_t *workspace_bwd_filter_sizes_; - size_t workspaceSizeInBytes; // size of underlying storage - void *workspaceData; // underlying storage - void **workspace; // aliases into workspaceData -}; -#endif - -} // namespace caffe - -#endif // CAFFE_CUDNN_CONV_LAYER_HPP_ diff --git a/include/caffe/layers/cudnn_lcn_layer.hpp b/include/caffe/layers/cudnn_lcn_layer.hpp deleted file mode 100644 index 74cf477..0000000 --- a/include/caffe/layers/cudnn_lcn_layer.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef CAFFE_CUDNN_LCN_LAYER_HPP_ -#define CAFFE_CUDNN_LCN_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/lrn_layer.hpp" -#include "caffe/layers/power_layer.hpp" - -namespace caffe { - -#ifdef USE_CUDNN -template -class CuDNNLCNLayer : public LRNLayer { - public: - explicit CuDNNLCNLayer(const LayerParameter& param) - : LRNLayer(param), handles_setup_(false), tempDataSize(0), - tempData1(NULL), tempData2(NULL) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNLCNLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnLRNDescriptor_t norm_desc_; - cudnnTensorDescriptor_t bottom_desc_, top_desc_; - - int size_, pre_pad_; - Dtype alpha_, beta_, k_; - - size_t tempDataSize; - void *tempData1, *tempData2; -}; -#endif - -} // namespace caffe - -#endif // CAFFE_CUDNN_LCN_LAYER_HPP_ diff --git a/include/caffe/layers/cudnn_lrn_layer.hpp b/include/caffe/layers/cudnn_lrn_layer.hpp deleted file mode 100644 index 000ccc3..0000000 --- a/include/caffe/layers/cudnn_lrn_layer.hpp +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef CAFFE_CUDNN_LRN_LAYER_HPP_ -#define CAFFE_CUDNN_LRN_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/lrn_layer.hpp" - -namespace caffe { - -#ifdef USE_CUDNN -template -class CuDNNLRNLayer : public LRNLayer { - public: - explicit CuDNNLRNLayer(const LayerParameter& param) - : LRNLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const 
vector*>& bottom, - const vector*>& top); - virtual ~CuDNNLRNLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnLRNDescriptor_t norm_desc_; - cudnnTensorDescriptor_t bottom_desc_, top_desc_; - - int size_; - Dtype alpha_, beta_, k_; -}; -#endif - -} // namespace caffe - -#endif // CAFFE_CUDNN_LRN_LAYER_HPP_ diff --git a/include/caffe/layers/cudnn_pooling_layer.hpp b/include/caffe/layers/cudnn_pooling_layer.hpp deleted file mode 100644 index 6d0db47..0000000 --- a/include/caffe/layers/cudnn_pooling_layer.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef CAFFE_CUDNN_POOLING_LAYER_HPP_ -#define CAFFE_CUDNN_POOLING_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/pooling_layer.hpp" - -namespace caffe { - -#ifdef USE_CUDNN -/* - * @brief cuDNN implementation of PoolingLayer. - * Fallback to PoolingLayer for CPU mode. -*/ -template -class CuDNNPoolingLayer : public PoolingLayer { - public: - explicit CuDNNPoolingLayer(const LayerParameter& param) - : PoolingLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNPoolingLayer(); - // Currently, cuDNN does not support the extra top blob. - virtual inline int MinTopBlobs() const { return -1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_, top_desc_; - cudnnPoolingDescriptor_t pooling_desc_; - cudnnPoolingMode_t mode_; -}; -#endif - -} // namespace caffe - -#endif // CAFFE_CUDNN_POOLING_LAYER_HPP_ diff --git a/include/caffe/layers/cudnn_relu_layer.hpp b/include/caffe/layers/cudnn_relu_layer.hpp deleted file mode 100644 index a1cb29e..0000000 --- a/include/caffe/layers/cudnn_relu_layer.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef CAFFE_CUDNN_RELU_LAYER_HPP_ -#define CAFFE_CUDNN_RELU_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/neuron_layer.hpp" -#include "caffe/layers/relu_layer.hpp" - -namespace caffe { - -#ifdef USE_CUDNN -/** - * @brief CuDNN acceleration of ReLULayer. 
- */ -template -class CuDNNReLULayer : public ReLULayer { - public: - explicit CuDNNReLULayer(const LayerParameter& param) - : ReLULayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNReLULayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; - cudnnActivationDescriptor_t activ_desc_; -}; -#endif - -} // namespace caffe - -#endif // CAFFE_CUDNN_RELU_LAYER_HPP_ diff --git a/include/caffe/layers/cudnn_sigmoid_layer.hpp b/include/caffe/layers/cudnn_sigmoid_layer.hpp deleted file mode 100644 index 7b3486f..0000000 --- a/include/caffe/layers/cudnn_sigmoid_layer.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef CAFFE_CUDNN_SIGMOID_LAYER_HPP_ -#define CAFFE_CUDNN_SIGMOID_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/neuron_layer.hpp" -#include "caffe/layers/sigmoid_layer.hpp" - -namespace caffe { - -#ifdef USE_CUDNN -/** - * @brief CuDNN acceleration of SigmoidLayer. - */ -template -class CuDNNSigmoidLayer : public SigmoidLayer { - public: - explicit CuDNNSigmoidLayer(const LayerParameter& param) - : SigmoidLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNSigmoidLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; - cudnnActivationDescriptor_t activ_desc_; -}; -#endif - -} // namespace caffe - -#endif // CAFFE_CUDNN_SIGMOID_LAYER_HPP_ diff --git a/include/caffe/layers/cudnn_softmax_layer.hpp b/include/caffe/layers/cudnn_softmax_layer.hpp deleted file mode 100644 index 174368e..0000000 --- a/include/caffe/layers/cudnn_softmax_layer.hpp +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef CAFFE_CUDNN_SOFTMAX_LAYER_HPP_ -#define CAFFE_CUDNN_SOFTMAX_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/softmax_layer.hpp" - -namespace caffe { - -#ifdef USE_CUDNN -/** - * @brief cuDNN implementation of SoftmaxLayer. - * Fallback to SoftmaxLayer for CPU mode. 
- */ -template -class CuDNNSoftmaxLayer : public SoftmaxLayer { - public: - explicit CuDNNSoftmaxLayer(const LayerParameter& param) - : SoftmaxLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNSoftmaxLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; -}; -#endif - -} // namespace caffe - -#endif // CAFFE_CUDNN_SOFTMAX_LAYER_HPP_ diff --git a/include/caffe/layers/cudnn_tanh_layer.hpp b/include/caffe/layers/cudnn_tanh_layer.hpp deleted file mode 100644 index 59e758d..0000000 --- a/include/caffe/layers/cudnn_tanh_layer.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef CAFFE_CUDNN_TANH_LAYER_HPP_ -#define CAFFE_CUDNN_TANH_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/neuron_layer.hpp" -#include "caffe/layers/tanh_layer.hpp" - -namespace caffe { - -#ifdef USE_CUDNN -/** - * @brief CuDNN acceleration of TanHLayer. - */ -template -class CuDNNTanHLayer : public TanHLayer { - public: - explicit CuDNNTanHLayer(const LayerParameter& param) - : TanHLayer(param), handles_setup_(false) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - virtual ~CuDNNTanHLayer(); - - protected: - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool handles_setup_; - cudnnHandle_t handle_; - cudnnTensorDescriptor_t bottom_desc_; - cudnnTensorDescriptor_t top_desc_; - cudnnActivationDescriptor_t activ_desc_; -}; -#endif - -} // namespace caffe - -#endif // CAFFE_CUDNN_TANH_LAYER_HPP_ diff --git a/include/caffe/layers/data_layer.hpp b/include/caffe/layers/data_layer.hpp deleted file mode 100644 index 6c36179..0000000 --- a/include/caffe/layers/data_layer.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef CAFFE_DATA_LAYER_HPP_ -#define CAFFE_DATA_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/data_reader.hpp" -#include "caffe/data_transformer.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/layer.hpp" -#include "caffe/layers/base_data_layer.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/db.hpp" - -namespace caffe { - -template -class DataLayer : public BasePrefetchingDataLayer { - public: - explicit DataLayer(const LayerParameter& param); - virtual ~DataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - // DataLayer uses DataReader instead for sharing for parallelism - virtual inline bool ShareInParallel() const { return false; } - virtual inline const char* type() const { return "Data"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline int MaxTopBlobs() const { return 2; } - - protected: - virtual void load_batch(Batch* batch); - - DataReader reader_; -}; - -} // namespace caffe - -#endif // CAFFE_DATA_LAYER_HPP_ diff --git a/include/caffe/layers/deconv_layer.hpp b/include/caffe/layers/deconv_layer.hpp index 
23ae887..0bd9578 100644 --- a/include/caffe/layers/deconv_layer.hpp +++ b/include/caffe/layers/deconv_layer.hpp @@ -36,12 +36,8 @@ class DeconvolutionLayer : public BaseConvolutionLayer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); virtual inline bool reverse_dimensions() { return true; } virtual void compute_output_shape(); }; diff --git a/include/caffe/layers/dropout_layer.hpp b/include/caffe/layers/dropout_layer.hpp index e83143b..c08f42f 100644 --- a/include/caffe/layers/dropout_layer.hpp +++ b/include/caffe/layers/dropout_layer.hpp @@ -59,12 +59,8 @@ class DropoutLayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ Blob rand_vec_; diff --git a/include/caffe/layers/dummy_data_layer.hpp b/include/caffe/layers/dummy_data_layer.hpp deleted file mode 100644 index 4180f1d..0000000 --- a/include/caffe/layers/dummy_data_layer.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef CAFFE_DUMMY_DATA_LAYER_HPP_ -#define CAFFE_DUMMY_DATA_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -/** - * @brief Provides data to the Net generated by a Filler. - * - * TODO(dox): thorough documentation for Forward and proto params. - */ -template -class DummyDataLayer : public Layer { - public: - explicit DummyDataLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } - // Data layers have no bottoms, so reshaping is trivial. 
- virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "DummyData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - vector > > fillers_; - vector refill_; -}; - -} // namespace caffe - -#endif // CAFFE_DUMMY_DATA_LAYER_HPP_ diff --git a/include/caffe/layers/eltwise_layer.hpp b/include/caffe/layers/eltwise_layer.hpp index 091de83..3bb3111 100644 --- a/include/caffe/layers/eltwise_layer.hpp +++ b/include/caffe/layers/eltwise_layer.hpp @@ -32,12 +32,8 @@ class EltwiseLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); EltwiseParameter_EltwiseOp op_; vector coeffs_; diff --git a/include/caffe/layers/elu_layer.hpp b/include/caffe/layers/elu_layer.hpp index 0796e89..0efc825 100644 --- a/include/caffe/layers/elu_layer.hpp +++ b/include/caffe/layers/elu_layer.hpp @@ -51,8 +51,6 @@ class ELULayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the ELU inputs. @@ -76,8 +74,6 @@ class ELULayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); }; diff --git a/include/caffe/layers/embed_layer.hpp b/include/caffe/layers/embed_layer.hpp index 36137a6..cb729c7 100644 --- a/include/caffe/layers/embed_layer.hpp +++ b/include/caffe/layers/embed_layer.hpp @@ -33,12 +33,8 @@ class EmbedLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); int M_; int K_; diff --git a/include/caffe/layers/euclidean_loss_layer.hpp b/include/caffe/layers/euclidean_loss_layer.hpp index f564569..09d2e39 100644 --- a/include/caffe/layers/euclidean_loss_layer.hpp +++ b/include/caffe/layers/euclidean_loss_layer.hpp @@ -58,8 +58,6 @@ class EuclideanLossLayer : public LossLayer { /// @copydoc EuclideanLossLayer virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the Euclidean error gradient w.r.t. the inputs. 
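(For reference: the GPU overrides removed here mirror the CPU path that remains, which computes the usual Euclidean loss. In the standard form used by this layer, with predictions \(\hat{y}_n\) and targets \(y_n\):)

```latex
E = \frac{1}{2N}\sum_{n=1}^{N}\left\|\hat{y}_n - y_n\right\|_2^2,
\qquad
\frac{\partial E}{\partial \hat{y}_n} = \frac{1}{N}\left(\hat{y}_n - y_n\right)
```

(the gradient with respect to the second input has the opposite sign).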
@@ -96,8 +94,6 @@ class EuclideanLossLayer : public LossLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); Blob diff_; }; diff --git a/include/caffe/layers/exp_layer.hpp b/include/caffe/layers/exp_layer.hpp index 9fc8c39..5ce9027 100644 --- a/include/caffe/layers/exp_layer.hpp +++ b/include/caffe/layers/exp_layer.hpp @@ -47,8 +47,6 @@ class ExpLayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the exp inputs. @@ -69,8 +67,6 @@ class ExpLayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); Dtype inner_scale_, outer_scale_; }; diff --git a/include/caffe/layers/filter_layer.hpp b/include/caffe/layers/filter_layer.hpp index e040e66..cb0a8bc 100644 --- a/include/caffe/layers/filter_layer.hpp +++ b/include/caffe/layers/filter_layer.hpp @@ -51,8 +51,6 @@ class FilterLayer : public Layer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the forwarded inputs. @@ -65,8 +63,6 @@ class FilterLayer : public Layer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); bool first_reshape_; vector indices_to_forward_; diff --git a/include/caffe/layers/hdf5_data_layer.hpp b/include/caffe/layers/hdf5_data_layer.hpp deleted file mode 100644 index b04cf8e..0000000 --- a/include/caffe/layers/hdf5_data_layer.hpp +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef CAFFE_HDF5_DATA_LAYER_HPP_ -#define CAFFE_HDF5_DATA_LAYER_HPP_ - -#include "hdf5.h" - -#include -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/base_data_layer.hpp" - -namespace caffe { - -/** - * @brief Provides data to the Net from HDF5 files. - * - * TODO(dox): thorough documentation for Forward and proto params. - */ -template -class HDF5DataLayer : public Layer { - public: - explicit HDF5DataLayer(const LayerParameter& param) - : Layer(param) {} - virtual ~HDF5DataLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } - // Data layers have no bottoms, so reshaping is trivial. 
- virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "HDF5Data"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void LoadHDF5FileData(const char* filename); - - std::vector hdf_filenames_; - unsigned int num_files_; - unsigned int current_file_; - hsize_t current_row_; - std::vector > > hdf_blobs_; - std::vector data_permutation_; - std::vector file_permutation_; -}; - -} // namespace caffe - -#endif // CAFFE_HDF5_DATA_LAYER_HPP_ diff --git a/include/caffe/layers/hdf5_output_layer.hpp b/include/caffe/layers/hdf5_output_layer.hpp deleted file mode 100644 index 487d08f..0000000 --- a/include/caffe/layers/hdf5_output_layer.hpp +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef CAFFE_HDF5_OUTPUT_LAYER_HPP_ -#define CAFFE_HDF5_OUTPUT_LAYER_HPP_ - -#include "hdf5.h" - -#include -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -#define HDF5_DATA_DATASET_NAME "data" -#define HDF5_DATA_LABEL_NAME "label" - -/** - * @brief Write blobs to disk as HDF5 files. - * - * TODO(dox): thorough documentation for Forward and proto params. - */ -template -class HDF5OutputLayer : public Layer { - public: - explicit HDF5OutputLayer(const LayerParameter& param) - : Layer(param), file_opened_(false) {} - virtual ~HDF5OutputLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } - // Data layers have no bottoms, so reshaping is trivial. 
- virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "HDF5Output"; } - // TODO: no limit on the number of blobs - virtual inline int ExactNumBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 0; } - - inline std::string file_name() const { return file_name_; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void SaveBlobs(); - - bool file_opened_; - std::string file_name_; - hid_t file_id_; - Blob data_blob_; - Blob label_blob_; -}; - -} // namespace caffe - -#endif // CAFFE_HDF5_OUTPUT_LAYER_HPP_ diff --git a/include/caffe/layers/im2col_layer.hpp b/include/caffe/layers/im2col_layer.hpp index 71e32f7..d1d5751 100644 --- a/include/caffe/layers/im2col_layer.hpp +++ b/include/caffe/layers/im2col_layer.hpp @@ -33,12 +33,8 @@ class Im2colLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); /// @brief The spatial dimensions of a filter kernel. Blob kernel_shape_; diff --git a/include/caffe/layers/image_data_layer.hpp b/include/caffe/layers/image_data_layer.hpp deleted file mode 100644 index a0d3384..0000000 --- a/include/caffe/layers/image_data_layer.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef CAFFE_IMAGE_DATA_LAYER_HPP_ -#define CAFFE_IMAGE_DATA_LAYER_HPP_ - -#include -#include -#include - -#include "caffe/blob.hpp" -#include "caffe/data_transformer.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/layer.hpp" -#include "caffe/layers/base_data_layer.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -/** - * @brief Provides data to the Net from image files. - * - * TODO(dox): thorough documentation for Forward and proto params. 
- */ -template -class ImageDataLayer : public BasePrefetchingDataLayer { - public: - explicit ImageDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~ImageDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "ImageData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - protected: - shared_ptr prefetch_rng_; - virtual void ShuffleImages(); - virtual void load_batch(Batch* batch); - - vector > lines_; - int lines_id_; -}; - - -} // namespace caffe - -#endif // CAFFE_IMAGE_DATA_LAYER_HPP_ diff --git a/include/caffe/layers/inner_product_layer.hpp b/include/caffe/layers/inner_product_layer.hpp index 18d0d61..8922bb9 100644 --- a/include/caffe/layers/inner_product_layer.hpp +++ b/include/caffe/layers/inner_product_layer.hpp @@ -32,12 +32,8 @@ class InnerProductLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); int M_; int K_; diff --git a/include/caffe/layers/log_layer.hpp b/include/caffe/layers/log_layer.hpp index 7d037d2..fd2bf34 100644 --- a/include/caffe/layers/log_layer.hpp +++ b/include/caffe/layers/log_layer.hpp @@ -47,8 +47,6 @@ class LogLayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the exp inputs. 
@@ -69,8 +67,6 @@ class LogLayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); Dtype base_scale_; Dtype input_scale_, input_shift_; diff --git a/include/caffe/layers/lrn_layer.hpp b/include/caffe/layers/lrn_layer.hpp index 06cf71a..b01b9c5 100644 --- a/include/caffe/layers/lrn_layer.hpp +++ b/include/caffe/layers/lrn_layer.hpp @@ -36,23 +36,15 @@ class LRNLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); virtual void CrossChannelForward_cpu(const vector*>& bottom, const vector*>& top); - virtual void CrossChannelForward_gpu(const vector*>& bottom, - const vector*>& top); virtual void WithinChannelForward(const vector*>& bottom, const vector*>& top); virtual void CrossChannelBackward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void CrossChannelBackward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); virtual void WithinChannelBackward(const vector*>& top, const vector& propagate_down, const vector*>& bottom); diff --git a/include/caffe/layers/lstm_layer.hpp b/include/caffe/layers/lstm_layer.hpp index a0e67c9..bcfbfe7 100644 --- a/include/caffe/layers/lstm_layer.hpp +++ b/include/caffe/layers/lstm_layer.hpp @@ -105,8 +105,6 @@ class LSTMUnitLayer : public Layer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the LSTMUnit inputs. @@ -141,8 +139,6 @@ class LSTMUnitLayer : public Layer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); /// @brief The hidden and output dimension. int hidden_dim_; diff --git a/include/caffe/layers/memory_data_layer.hpp b/include/caffe/layers/memory_data_layer.hpp deleted file mode 100644 index 8abcc8c..0000000 --- a/include/caffe/layers/memory_data_layer.hpp +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef CAFFE_MEMORY_DATA_LAYER_HPP_ -#define CAFFE_MEMORY_DATA_LAYER_HPP_ - -#include - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/base_data_layer.hpp" - -namespace caffe { - -/** - * @brief Provides data to the Net from memory. - * - * TODO(dox): thorough documentation for Forward and proto params. 
- */ -template -class MemoryDataLayer : public BaseDataLayer { - public: - explicit MemoryDataLayer(const LayerParameter& param) - : BaseDataLayer(param), has_new_data_(false) {} - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MemoryData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - virtual void AddDatumVector(const vector& datum_vector); -#ifdef USE_OPENCV - virtual void AddMatVector(const vector& mat_vector, - const vector& labels); -#endif // USE_OPENCV - - // Reset should accept const pointers, but can't, because the memory - // will be given to Blob, which is mutable - void Reset(Dtype* data, Dtype* label, int n); - void set_batch_size(int new_size); - - int batch_size() { return batch_size_; } - int channels() { return channels_; } - int height() { return height_; } - int width() { return width_; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - int batch_size_, channels_, height_, width_, size_; - Dtype* data_; - Dtype* labels_; - int n_; - size_t pos_; - Blob added_data_; - Blob added_label_; - bool has_new_data_; -}; - -} // namespace caffe - -#endif // CAFFE_MEMORY_DATA_LAYER_HPP_ diff --git a/include/caffe/layers/mvn_layer.hpp b/include/caffe/layers/mvn_layer.hpp index 3a235ce..0e25b87 100644 --- a/include/caffe/layers/mvn_layer.hpp +++ b/include/caffe/layers/mvn_layer.hpp @@ -29,12 +29,8 @@ class MVNLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); Blob mean_, variance_, temp_; diff --git a/include/caffe/layers/pooling_layer.hpp b/include/caffe/layers/pooling_layer.hpp index f4d6803..b0854b2 100644 --- a/include/caffe/layers/pooling_layer.hpp +++ b/include/caffe/layers/pooling_layer.hpp @@ -37,12 +37,8 @@ class PoolingLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); int kernel_h_, kernel_w_; int stride_h_, stride_w_; diff --git a/include/caffe/layers/power_layer.hpp b/include/caffe/layers/power_layer.hpp index 6ecbafc..75b98e1 100644 --- a/include/caffe/layers/power_layer.hpp +++ b/include/caffe/layers/power_layer.hpp @@ -46,8 +46,6 @@ class PowerLayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the power inputs. 
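(For reference, PowerLayer computes a shifted, scaled power function, so the CPU backward pass that remains is just the chain rule. Writing shift, scale and power from layer_param_.power_param() as \(\alpha\), \(\beta\), \(\gamma\):)

```latex
y = (\alpha + \beta x)^{\gamma},
\qquad
\frac{\partial E}{\partial x} = \frac{\partial E}{\partial y}\,\gamma\beta\,(\alpha + \beta x)^{\gamma - 1}
```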
@@ -71,8 +69,6 @@ class PowerLayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); /// @brief @f$ \gamma @f$ from layer_param_.power_param() Dtype power_; diff --git a/include/caffe/layers/prelu_layer.hpp b/include/caffe/layers/prelu_layer.hpp index 3ddfb48..0725143 100644 --- a/include/caffe/layers/prelu_layer.hpp +++ b/include/caffe/layers/prelu_layer.hpp @@ -54,8 +54,6 @@ class PReLULayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the PReLU inputs. @@ -87,8 +85,6 @@ class PReLULayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); bool channel_shared_; Blob multiplier_; // dot multiplier for backward computation of params diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp deleted file mode 100644 index 66dbbdf..0000000 --- a/include/caffe/layers/python_layer.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef CAFFE_PYTHON_LAYER_HPP_ -#define CAFFE_PYTHON_LAYER_HPP_ - -#include -#include - -#include "caffe/layer.hpp" - -namespace bp = boost::python; - -namespace caffe { - -template -class PythonLayer : public Layer { - public: - PythonLayer(PyObject* self, const LayerParameter& param) - : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { } - - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) { - // Disallow PythonLayer in MultiGPU training stage, due to GIL issues - // Details: https://github.com/BVLC/caffe/issues/2936 - if (this->phase_ == TRAIN && Caffe::solver_count() > 1 - && !ShareInParallel()) { - LOG(FATAL) << "PythonLayer is not implemented in Multi-GPU training"; - } - self_.attr("param_str") = bp::str( - this->layer_param_.python_param().param_str()); - self_.attr("phase") = static_cast(this->phase_); - self_.attr("setup")(bottom, top); - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - self_.attr("reshape")(bottom, top); - } - - virtual inline bool ShareInParallel() const { - return this->layer_param_.python_param().share_in_parallel(); - } - - virtual inline const char* type() const { return "Python"; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) { - self_.attr("forward")(bottom, top); - } - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - self_.attr("backward")(top, propagate_down, bottom); - } - - private: - bp::object self_; -}; - -} // namespace caffe - -#endif diff --git a/include/caffe/layers/recurrent_layer.hpp b/include/caffe/layers/recurrent_layer.hpp index ca17371..b5526fe 100644 --- a/include/caffe/layers/recurrent_layer.hpp +++ b/include/caffe/layers/recurrent_layer.hpp @@ -142,8 +142,6 @@ class RecurrentLayer : public Layer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); diff --git a/include/caffe/layers/reduction_layer.hpp 
b/include/caffe/layers/reduction_layer.hpp index 804a495..cf239f6 100644 --- a/include/caffe/layers/reduction_layer.hpp +++ b/include/caffe/layers/reduction_layer.hpp @@ -33,12 +33,8 @@ class ReductionLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); /// @brief the reduction operation performed by the layer ReductionParameter_ReductionOp op_; diff --git a/include/caffe/layers/relu_layer.hpp b/include/caffe/layers/relu_layer.hpp index d7a73f7..3070f54 100644 --- a/include/caffe/layers/relu_layer.hpp +++ b/include/caffe/layers/relu_layer.hpp @@ -43,8 +43,6 @@ class ReLULayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the ReLU inputs. @@ -76,8 +74,6 @@ class ReLULayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); }; } // namespace caffe diff --git a/include/caffe/layers/reshape_layer.hpp b/include/caffe/layers/reshape_layer.hpp index d11e063..90c038d 100644 --- a/include/caffe/layers/reshape_layer.hpp +++ b/include/caffe/layers/reshape_layer.hpp @@ -34,10 +34,6 @@ class ReshapeLayer : public Layer { const vector*>& top) {} virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) {} - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} /// @brief vector of axes indices whose dimensions we'll copy from the bottom vector copy_axes_; diff --git a/include/caffe/layers/scale_layer.hpp b/include/caffe/layers/scale_layer.hpp index 924df2e..892e159 100644 --- a/include/caffe/layers/scale_layer.hpp +++ b/include/caffe/layers/scale_layer.hpp @@ -58,12 +58,8 @@ class ScaleLayer: public Layer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); shared_ptr > bias_layer_; vector*> bias_bottom_vec_; diff --git a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp index 598dca5..a25b16c 100644 --- a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp +++ b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp @@ -92,8 +92,6 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); /// The internal SigmoidLayer used to map predictions to probabilities. 
shared_ptr > sigmoid_layer_; diff --git a/include/caffe/layers/sigmoid_layer.hpp b/include/caffe/layers/sigmoid_layer.hpp index ac0f692..31e765e 100644 --- a/include/caffe/layers/sigmoid_layer.hpp +++ b/include/caffe/layers/sigmoid_layer.hpp @@ -40,8 +40,6 @@ class SigmoidLayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the sigmoid inputs. @@ -62,8 +60,6 @@ class SigmoidLayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); }; } // namespace caffe diff --git a/include/caffe/layers/silence_layer.hpp b/include/caffe/layers/silence_layer.hpp index fba087f..412c9fa 100644 --- a/include/caffe/layers/silence_layer.hpp +++ b/include/caffe/layers/silence_layer.hpp @@ -28,14 +28,8 @@ class SilenceLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top) {} - // We can't define Forward_gpu here, since STUB_GPU will provide - // its own definition for CPU_ONLY mode. - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); }; } // namespace caffe diff --git a/include/caffe/layers/slice_layer.hpp b/include/caffe/layers/slice_layer.hpp index 10a0abb..a241797 100644 --- a/include/caffe/layers/slice_layer.hpp +++ b/include/caffe/layers/slice_layer.hpp @@ -32,12 +32,8 @@ class SliceLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); int count_; int num_slices_; diff --git a/include/caffe/layers/softmax_layer.hpp b/include/caffe/layers/softmax_layer.hpp index c65b870..461955b 100644 --- a/include/caffe/layers/softmax_layer.hpp +++ b/include/caffe/layers/softmax_layer.hpp @@ -29,12 +29,8 @@ class SoftmaxLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); int outer_num_; int inner_num_; diff --git a/include/caffe/layers/softmax_loss_layer.hpp b/include/caffe/layers/softmax_loss_layer.hpp index f07e8a0..74452c3 100644 --- a/include/caffe/layers/softmax_loss_layer.hpp +++ b/include/caffe/layers/softmax_loss_layer.hpp @@ -66,8 +66,6 @@ class SoftmaxWithLossLayer : public LossLayer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the softmax loss error gradient w.r.t. the predictions. 
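(For reference, with \(p = \mathrm{softmax}(x)\) and label \(l_n\), the loss and the gradient with respect to the predictions that the remaining CPU backward pass computes are, up to the normalization mode the layer supports:)

```latex
E = -\frac{1}{N}\sum_{n=1}^{N}\log p_{n,l_n},
\qquad
\frac{\partial E}{\partial x_{n,k}} = \frac{1}{N}\left(p_{n,k} - \delta_{k,l_n}\right)
```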
* @@ -97,8 +95,6 @@ class SoftmaxWithLossLayer : public LossLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); /// Read the normalization mode parameter and compute the normalizer based /// on the blob size. If normalization_mode is VALID, the count of valid diff --git a/include/caffe/layers/split_layer.hpp b/include/caffe/layers/split_layer.hpp index 8140dfc..f704202 100644 --- a/include/caffe/layers/split_layer.hpp +++ b/include/caffe/layers/split_layer.hpp @@ -30,12 +30,8 @@ class SplitLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); int count_; }; diff --git a/include/caffe/layers/tanh_layer.hpp b/include/caffe/layers/tanh_layer.hpp index 8f95e93..4b81013 100644 --- a/include/caffe/layers/tanh_layer.hpp +++ b/include/caffe/layers/tanh_layer.hpp @@ -40,8 +40,6 @@ class TanHLayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the sigmoid inputs. @@ -64,8 +62,6 @@ class TanHLayer : public NeuronLayer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); }; } // namespace caffe diff --git a/include/caffe/layers/threshold_layer.hpp b/include/caffe/layers/threshold_layer.hpp index 3bf4db6..686a605 100644 --- a/include/caffe/layers/threshold_layer.hpp +++ b/include/caffe/layers/threshold_layer.hpp @@ -48,8 +48,6 @@ class ThresholdLayer : public NeuronLayer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /// @brief Not implemented (non-differentiable function) virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { diff --git a/include/caffe/layers/tile_layer.hpp b/include/caffe/layers/tile_layer.hpp index fbdbe2f..9e08462 100644 --- a/include/caffe/layers/tile_layer.hpp +++ b/include/caffe/layers/tile_layer.hpp @@ -27,13 +27,9 @@ class TileLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); unsigned int axis_, tiles_, outer_dim_, inner_dim_; }; diff --git a/include/caffe/layers/window_data_layer.hpp b/include/caffe/layers/window_data_layer.hpp deleted file mode 100644 index 35f41b8..0000000 --- a/include/caffe/layers/window_data_layer.hpp +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef CAFFE_WINDOW_DATA_LAYER_HPP_ -#define CAFFE_WINDOW_DATA_LAYER_HPP_ - -#include -#include -#include - -#include "caffe/blob.hpp" -#include "caffe/data_transformer.hpp" -#include "caffe/internal_thread.hpp" -#include 
"caffe/layer.hpp" -#include "caffe/layers/base_data_layer.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -/** - * @brief Provides data to the Net from windows of images files, specified - * by a window data file. - * - * TODO(dox): thorough documentation for Forward and proto params. - */ -template -class WindowDataLayer : public BasePrefetchingDataLayer { - public: - explicit WindowDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~WindowDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "WindowData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - protected: - virtual unsigned int PrefetchRand(); - virtual void load_batch(Batch* batch); - - shared_ptr prefetch_rng_; - vector > > image_database_; - enum WindowField { IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM }; - vector > fg_windows_; - vector > bg_windows_; - Blob data_mean_; - vector mean_values_; - bool has_mean_file_; - bool has_mean_values_; - bool cache_images_; - vector > image_database_cache_; -}; - -} // namespace caffe - -#endif // CAFFE_WINDOW_DATA_LAYER_HPP_ diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 0addb3c..a8c9ca4 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -113,11 +113,8 @@ class Net { void CopyTrainedLayersFrom(const NetParameter& param); void CopyTrainedLayersFrom(const string trained_filename); void CopyTrainedLayersFromBinaryProto(const string trained_filename); - void CopyTrainedLayersFromHDF5(const string trained_filename); /// @brief Writes the net to a proto. void ToProto(NetParameter* param, bool write_diff = false) const; - /// @brief Writes the net to an HDF5 file. - void ToHDF5(const string& filename, bool write_diff = false) const; /// @brief returns the network name. inline const string& name() const { return name_; } @@ -241,13 +238,6 @@ class Net { void AppendParam(const NetParameter& param, const int layer_id, const int param_id); - /// @brief Helper for displaying debug info in Forward. - void ForwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Backward. - void BackwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Update. - void UpdateDebugInfo(const int param_id); - /// @brief The network name string name_; /// @brief The phase: TRAIN or TEST diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp deleted file mode 100644 index 6c496c8..0000000 --- a/include/caffe/parallel.hpp +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef CAFFE_PARALLEL_HPP_ -#define CAFFE_PARALLEL_HPP_ - -#include - -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/solver.hpp" -#include "caffe/syncedmem.hpp" -#include "caffe/util/blocking_queue.hpp" - -namespace caffe { - -// Represents a net parameters. Once a net is created, its parameter buffers can -// be replaced by ones from Params, to allow parallelization. Params ensures -// parameters are allocated in one consecutive array. 
-template -class Params { - public: - explicit Params(shared_ptr > root_solver); - virtual ~Params() { - } - - inline size_t size() const { - return size_; - } - inline Dtype* data() const { - return data_; - } - inline Dtype* diff() const { - return diff_; - } - - protected: - const size_t size_; // Size of buffers - Dtype* data_; // Network parameters - Dtype* diff_; // Gradient - -DISABLE_COPY_AND_ASSIGN(Params); -}; - -// Params stored in GPU memory. -template -class GPUParams : public Params { - public: - GPUParams(shared_ptr > root_solver, int device); - virtual ~GPUParams(); - - void configure(Solver* solver) const; - - protected: - using Params::size_; - using Params::data_; - using Params::diff_; -}; - -class DevicePair { - public: - DevicePair(int parent, int device) - : parent_(parent), - device_(device) { - } - inline int parent() { - return parent_; - } - inline int device() { - return device_; - } - - // Group GPUs in pairs, by proximity depending on machine's topology - static void compute(const vector devices, vector* pairs); - - protected: - int parent_; - int device_; -}; - -// Synchronous data parallelism using map-reduce between local GPUs. -template -class P2PSync : public GPUParams, public Solver::Callback, - public InternalThread { - public: - explicit P2PSync(shared_ptr > root_solver, - P2PSync* parent, const SolverParameter& param); - virtual ~P2PSync(); - - inline const shared_ptr >& solver() const { - return solver_; - } - - void Run(const vector& gpus); - void Prepare(const vector& gpus, - vector > >* syncs); - inline const int initial_iter() const { return initial_iter_; } - - protected: - void on_start(); - void on_gradients_ready(); - - void InternalThreadEntry(); - - P2PSync* parent_; - vector*> children_; - BlockingQueue*> queue_; - const int initial_iter_; - Dtype* parent_grads_; - shared_ptr > solver_; - - using Params::size_; - using Params::data_; - using Params::diff_; -}; - -} // namespace caffe - -#endif diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp deleted file mode 100644 index 1fc52d8..0000000 --- a/include/caffe/sgd_solvers.hpp +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef CAFFE_SGD_SOLVERS_HPP_ -#define CAFFE_SGD_SOLVERS_HPP_ - -#include -#include - -#include "caffe/solver.hpp" - -namespace caffe { - -/** - * @brief Optimizes the parameters of a Net using - * stochastic gradient descent (SGD) with momentum. - */ -template -class SGDSolver : public Solver { - public: - explicit SGDSolver(const SolverParameter& param) - : Solver(param) { PreSolve(); } - explicit SGDSolver(const string& param_file) - : Solver(param_file) { PreSolve(); } - virtual inline const char* type() const { return "SGD"; } - - const vector > >& history() { return history_; } - - protected: - void PreSolve(); - Dtype GetLearningRate(); - virtual void ApplyUpdate(); - virtual void Normalize(int param_id); - virtual void Regularize(int param_id); - virtual void ComputeUpdateValue(int param_id, Dtype rate); - virtual void ClipGradients(); - virtual void SnapshotSolverState(const string& model_filename); - virtual void SnapshotSolverStateToBinaryProto(const string& model_filename); - virtual void SnapshotSolverStateToHDF5(const string& model_filename); - virtual void RestoreSolverStateFromHDF5(const string& state_file); - virtual void RestoreSolverStateFromBinaryProto(const string& state_file); - // history maintains the historical momentum data. - // update maintains update related data and is not needed in snapshots. 
- // temp maintains other information that might be needed in computation - // of gradients/updates and is not needed in snapshots - vector > > history_, update_, temp_; - - DISABLE_COPY_AND_ASSIGN(SGDSolver); -}; - -template -class NesterovSolver : public SGDSolver { - public: - explicit NesterovSolver(const SolverParameter& param) - : SGDSolver(param) {} - explicit NesterovSolver(const string& param_file) - : SGDSolver(param_file) {} - virtual inline const char* type() const { return "Nesterov"; } - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - - DISABLE_COPY_AND_ASSIGN(NesterovSolver); -}; - -template -class AdaGradSolver : public SGDSolver { - public: - explicit AdaGradSolver(const SolverParameter& param) - : SGDSolver(param) { constructor_sanity_check(); } - explicit AdaGradSolver(const string& param_file) - : SGDSolver(param_file) { constructor_sanity_check(); } - virtual inline const char* type() const { return "AdaGrad"; } - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - void constructor_sanity_check() { - CHECK_EQ(0, this->param_.momentum()) - << "Momentum cannot be used with AdaGrad."; - } - - DISABLE_COPY_AND_ASSIGN(AdaGradSolver); -}; - - -template -class RMSPropSolver : public SGDSolver { - public: - explicit RMSPropSolver(const SolverParameter& param) - : SGDSolver(param) { constructor_sanity_check(); } - explicit RMSPropSolver(const string& param_file) - : SGDSolver(param_file) { constructor_sanity_check(); } - virtual inline const char* type() const { return "RMSProp"; } - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - void constructor_sanity_check() { - CHECK_EQ(0, this->param_.momentum()) - << "Momentum cannot be used with RMSProp."; - CHECK_GE(this->param_.rms_decay(), 0) - << "rms_decay should lie between 0 and 1."; - CHECK_LT(this->param_.rms_decay(), 1) - << "rms_decay should lie between 0 and 1."; - } - - DISABLE_COPY_AND_ASSIGN(RMSPropSolver); -}; - -template -class AdaDeltaSolver : public SGDSolver { - public: - explicit AdaDeltaSolver(const SolverParameter& param) - : SGDSolver(param) { AdaDeltaPreSolve(); } - explicit AdaDeltaSolver(const string& param_file) - : SGDSolver(param_file) { AdaDeltaPreSolve(); } - virtual inline const char* type() const { return "AdaDelta"; } - - protected: - void AdaDeltaPreSolve(); - virtual void ComputeUpdateValue(int param_id, Dtype rate); - - DISABLE_COPY_AND_ASSIGN(AdaDeltaSolver); -}; - -/** - * @brief AdamSolver, an algorithm for first-order gradient-based optimization - * of stochastic objective functions, based on adaptive estimates of - * lower-order moments. Described in [1]. - * - * [1] D. P. Kingma and J. L. Ba, "ADAM: A Method for Stochastic Optimization." - * arXiv preprint arXiv:1412.6980v8 (2014). 
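(For reference, the update rule from [1] that the removed AdamSolver applies, with gradient \(g_t\), moment estimates \(m_t, v_t\) and solver hyper-parameters \(\alpha, \beta_1, \beta_2, \varepsilon\):)

```latex
m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t,
\qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2,
\qquad
\theta_t = \theta_{t-1} - \alpha\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}}\;\frac{m_t}{\sqrt{v_t}+\varepsilon}
```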
- */ -template -class AdamSolver : public SGDSolver { - public: - explicit AdamSolver(const SolverParameter& param) - : SGDSolver(param) { AdamPreSolve();} - explicit AdamSolver(const string& param_file) - : SGDSolver(param_file) { AdamPreSolve(); } - virtual inline const char* type() const { return "Adam"; } - - protected: - void AdamPreSolve(); - virtual void ComputeUpdateValue(int param_id, Dtype rate); - - DISABLE_COPY_AND_ASSIGN(AdamSolver); -}; - -} // namespace caffe - -#endif // CAFFE_SGD_SOLVERS_HPP_ diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp deleted file mode 100644 index 38259ed..0000000 --- a/include/caffe/solver.hpp +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef CAFFE_SOLVER_HPP_ -#define CAFFE_SOLVER_HPP_ -#include -#include -#include - -#include "caffe/net.hpp" -#include "caffe/solver_factory.hpp" - -namespace caffe { - -/** - * @brief Enumeration of actions that a client of the Solver may request by - * implementing the Solver's action request function, which a - * a client may optionally provide in order to request early termination - * or saving a snapshot without exiting. In the executable caffe, this - * mechanism is used to allow the snapshot to be saved when stopping - * execution with a SIGINT (Ctrl-C). - */ - namespace SolverAction { - enum Enum { - NONE = 0, // Take no special action. - STOP = 1, // Stop training. snapshot_after_train controls whether a - // snapshot is created. - SNAPSHOT = 2 // Take a snapshot, and keep training. - }; - } - -/** - * @brief Type of a function that returns a Solver Action enumeration. - */ -typedef boost::function ActionCallback; - -/** - * @brief An interface for classes that perform optimization on Net%s. - * - * Requires implementation of ApplyUpdate to compute a parameter update - * given the current state of the Net parameters. - */ -template -class Solver { - public: - explicit Solver(const SolverParameter& param, - const Solver* root_solver = NULL); - explicit Solver(const string& param_file, const Solver* root_solver = NULL); - void Init(const SolverParameter& param); - void InitTrainNet(); - void InitTestNets(); - - // Client of the Solver optionally may call this in order to set the function - // that the solver uses to see what action it should take (e.g. snapshot or - // exit training early). - void SetActionFunction(ActionCallback func); - SolverAction::Enum GetRequestedAction(); - // The main entry of the solver function. In default, iter will be zero. Pass - // in a non-zero iter number to resume training for a pre-trained net. - virtual void Solve(const char* resume_file = NULL); - inline void Solve(const string resume_file) { Solve(resume_file.c_str()); } - void Step(int iters); - // The Restore method simply dispatches to one of the - // RestoreSolverStateFrom___ protected methods. You should implement these - // methods to restore the state from the appropriate snapshot type. - void Restore(const char* resume_file); - // The Solver::Snapshot function implements the basic snapshotting utility - // that stores the learned net. You should implement the SnapshotSolverState() - // function that produces a SolverState protocol buffer that needs to be - // written to disk together with the learned net. 
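The SolverAction / SetActionFunction hook quoted earlier in this header is how the caffe binary turned a SIGINT into a snapshot-or-stop request. A rough sketch of that wiring, for the record; the handler and function names below are illustrative, not taken from the removed sources:

```cpp
#include <csignal>

#include "caffe/solver.hpp"  // removed by this change; provides SolverAction / ActionCallback

// Illustrative only: a SIGINT handler sets a flag, and the callback installed
// on the solver translates that flag into a SolverAction the solver polls.
static volatile std::sig_atomic_t sigint_received = 0;

static void handle_sigint(int) { sigint_received = 1; }

static caffe::SolverAction::Enum requested_action() {
  return sigint_received ? caffe::SolverAction::STOP
                         : caffe::SolverAction::NONE;
}

// Usage sketch, assuming a caffe::Solver<float>* solver has been created:
//   std::signal(SIGINT, handle_sigint);
//   solver->SetActionFunction(requested_action);
```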
- void Snapshot(); - virtual ~Solver() {} - inline const SolverParameter& param() const { return param_; } - inline shared_ptr > net() { return net_; } - inline const vector > >& test_nets() { - return test_nets_; - } - int iter() { return iter_; } - - // Invoked at specific points during an iteration - class Callback { - protected: - virtual void on_start() = 0; - virtual void on_gradients_ready() = 0; - - template - friend class Solver; - }; - const vector& callbacks() const { return callbacks_; } - void add_callback(Callback* value) { - callbacks_.push_back(value); - } - - void CheckSnapshotWritePermissions(); - /** - * @brief Returns the solver type. - */ - virtual inline const char* type() const { return ""; } - - protected: - // Make and apply the update value for the current iteration. - virtual void ApplyUpdate() = 0; - string SnapshotFilename(const string extension); - string SnapshotToBinaryProto(); - string SnapshotToHDF5(); - // The test routine - void TestAll(); - void Test(const int test_net_id = 0); - virtual void SnapshotSolverState(const string& model_filename) = 0; - virtual void RestoreSolverStateFromHDF5(const string& state_file) = 0; - virtual void RestoreSolverStateFromBinaryProto(const string& state_file) = 0; - void DisplayOutputBlobs(const int net_id); - void UpdateSmoothedLoss(Dtype loss, int start_iter, int average_loss); - - SolverParameter param_; - int iter_; - int current_step_; - shared_ptr > net_; - vector > > test_nets_; - vector callbacks_; - vector losses_; - Dtype smoothed_loss_; - - // The root solver that holds root nets (actually containing shared layers) - // in data parallelism - const Solver* const root_solver_; - - // A function that can be set by a client of the Solver to provide indication - // that it wants a snapshot saved and/or to exit early. - ActionCallback action_request_function_; - - // True iff a request to stop early was received. - bool requested_early_exit_; - - DISABLE_COPY_AND_ASSIGN(Solver); -}; - -/** - * @brief Solver that only computes gradients, used as worker - * for multi-GPU training. - */ -template -class WorkerSolver : public Solver { - public: - explicit WorkerSolver(const SolverParameter& param, - const Solver* root_solver = NULL) - : Solver(param, root_solver) {} - - protected: - void ApplyUpdate() {} - void SnapshotSolverState(const string& model_filename) { - LOG(FATAL) << "Should not be called on worker solver."; - } - void RestoreSolverStateFromBinaryProto(const string& state_file) { - LOG(FATAL) << "Should not be called on worker solver."; - } - void RestoreSolverStateFromHDF5(const string& state_file) { - LOG(FATAL) << "Should not be called on worker solver."; - } -}; - -} // namespace caffe - -#endif // CAFFE_SOLVER_HPP_ diff --git a/include/caffe/solver_factory.hpp b/include/caffe/solver_factory.hpp deleted file mode 100644 index cfff721..0000000 --- a/include/caffe/solver_factory.hpp +++ /dev/null @@ -1,137 +0,0 @@ -/** - * @brief A solver factory that allows one to register solvers, similar to - * layer factory. During runtime, registered solvers could be called by passing - * a SolverParameter protobuffer to the CreateSolver function: - * - * SolverRegistry::CreateSolver(param); - * - * There are two ways to register a solver. Assuming that we have a solver like: - * - * template - * class MyAwesomeSolver : public Solver { - * // your implementations - * }; - * - * and its type is its C++ class name, but without the "Solver" at the end - * ("MyAwesomeSolver" -> "MyAwesome"). 
- * - * If the solver is going to be created simply by its constructor, in your c++ - * file, add the following line: - * - * REGISTER_SOLVER_CLASS(MyAwesome); - * - * Or, if the solver is going to be created by another creator function, in the - * format of: - * - * template - * Solver GetMyAwesomeSolver(const SolverParameter& param) { - * // your implementation - * } - * - * then you can register the creator function instead, like - * - * REGISTER_SOLVER_CREATOR(MyAwesome, GetMyAwesomeSolver) - * - * Note that each solver type should only be registered once. - */ - -#ifndef CAFFE_SOLVER_FACTORY_H_ -#define CAFFE_SOLVER_FACTORY_H_ - -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -template -class Solver; - -template -class SolverRegistry { - public: - typedef Solver* (*Creator)(const SolverParameter&); - typedef std::map CreatorRegistry; - - static CreatorRegistry& Registry() { - static CreatorRegistry* g_registry_ = new CreatorRegistry(); - return *g_registry_; - } - - // Adds a creator. - static void AddCreator(const string& type, Creator creator) { - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 0) - << "Solver type " << type << " already registered."; - registry[type] = creator; - } - - // Get a solver using a SolverParameter. - static Solver* CreateSolver(const SolverParameter& param) { - const string& type = param.type(); - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 1) << "Unknown solver type: " << type - << " (known types: " << SolverTypeListString() << ")"; - return registry[type](param); - } - - static vector SolverTypeList() { - CreatorRegistry& registry = Registry(); - vector solver_types; - for (typename CreatorRegistry::iterator iter = registry.begin(); - iter != registry.end(); ++iter) { - solver_types.push_back(iter->first); - } - return solver_types; - } - - private: - // Solver registry should never be instantiated - everything is done with its - // static variables. - SolverRegistry() {} - - static string SolverTypeListString() { - vector solver_types = SolverTypeList(); - string solver_types_str; - for (vector::iterator iter = solver_types.begin(); - iter != solver_types.end(); ++iter) { - if (iter != solver_types.begin()) { - solver_types_str += ", "; - } - solver_types_str += *iter; - } - return solver_types_str; - } -}; - - -template -class SolverRegisterer { - public: - SolverRegisterer(const string& type, - Solver* (*creator)(const SolverParameter&)) { - // LOG(INFO) << "Registering solver type: " << type; - SolverRegistry::AddCreator(type, creator); - } -}; - - -#define REGISTER_SOLVER_CREATOR(type, creator) \ - static SolverRegisterer g_creator_f_##type(#type, creator); \ - static SolverRegisterer g_creator_d_##type(#type, creator) \ - -#define REGISTER_SOLVER_CLASS(type) \ - template \ - Solver* Creator_##type##Solver( \ - const SolverParameter& param) \ - { \ - return new type##Solver(param); \ - } \ - REGISTER_SOLVER_CREATOR(type, Creator_##type##Solver) - -} // namespace caffe - -#endif // CAFFE_SOLVER_FACTORY_H_ diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 38ee466..a845b1f 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -13,25 +13,12 @@ namespace caffe { // but might be more significant for parallel training. Most importantly, // it improved stability for large models on many GPUs. 
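Stepping back to the SolverRegistry / REGISTER_SOLVER_CLASS machinery deleted just above: the pattern its header comment describes looks roughly like the following. MyAwesomeSolver is the hypothetical example from that comment, derived here from SGDSolver only for brevity:

```cpp
#include "caffe/sgd_solvers.hpp"  // removed by this change

namespace caffe {

// Hypothetical solver, named so that its type string is "MyAwesome".
template <typename Dtype>
class MyAwesomeSolver : public SGDSolver<Dtype> {
 public:
  explicit MyAwesomeSolver(const SolverParameter& param)
      : SGDSolver<Dtype>(param) {}
  virtual inline const char* type() const { return "MyAwesome"; }

 protected:
  virtual void ComputeUpdateValue(int param_id, Dtype rate) {
    // custom update rule would go here
  }
};

// Registers float and double creators, so
// SolverRegistry<Dtype>::CreateSolver(param) can build it by type name.
REGISTER_SOLVER_CLASS(MyAwesome);

}  // namespace caffe
```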
inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) { -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - CUDA_CHECK(cudaMallocHost(ptr, size)); - *use_cuda = true; - return; - } -#endif *ptr = malloc(size); *use_cuda = false; CHECK(*ptr) << "host allocation of size " << size << " failed"; } inline void CaffeFreeHost(void* ptr, bool use_cuda) { -#ifndef CPU_ONLY - if (use_cuda) { - CUDA_CHECK(cudaFreeHost(ptr)); - return; - } -#endif free(ptr); } @@ -63,10 +50,6 @@ class SyncedMemory { SyncedHead head() { return head_; } size_t size() { return size_; } -#ifndef CPU_ONLY - void async_gpu_push(const cudaStream_t& stream); -#endif - private: void to_cpu(); void to_gpu(); diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp deleted file mode 100644 index fc15609..0000000 --- a/include/caffe/test/test_caffe_main.hpp +++ /dev/null @@ -1,78 +0,0 @@ -// The main caffe test code. Your test cpp code should include this hpp -// to allow a main function to be compiled into the binary. -#ifndef CAFFE_TEST_TEST_CAFFE_MAIN_HPP_ -#define CAFFE_TEST_TEST_CAFFE_MAIN_HPP_ - -#include -#include - -#include -#include - -#include "caffe/common.hpp" - -using std::cout; -using std::endl; - -#ifdef CMAKE_BUILD - #include "caffe_config.h" -#else - #define CUDA_TEST_DEVICE -1 - #define CMAKE_SOURCE_DIR "src/" - #define EXAMPLES_SOURCE_DIR "examples/" - #define CMAKE_EXT "" -#endif - -int main(int argc, char** argv); - -namespace caffe { - -template -class MultiDeviceTest : public ::testing::Test { - public: - typedef typename TypeParam::Dtype Dtype; - protected: - MultiDeviceTest() { - Caffe::set_mode(TypeParam::device); - } - virtual ~MultiDeviceTest() {} -}; - -typedef ::testing::Types TestDtypes; - -template -struct CPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::CPU; -}; - -template -class CPUDeviceTest : public MultiDeviceTest > { -}; - -#ifdef CPU_ONLY - -typedef ::testing::Types, - CPUDevice > TestDtypesAndDevices; - -#else - -template -struct GPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::GPU; -}; - -template -class GPUDeviceTest : public MultiDeviceTest > { -}; - -typedef ::testing::Types, CPUDevice, - GPUDevice, GPUDevice > - TestDtypesAndDevices; - -#endif - -} // namespace caffe - -#endif // CAFFE_TEST_TEST_CAFFE_MAIN_HPP_ diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp deleted file mode 100644 index b25a848..0000000 --- a/include/caffe/test/test_gradient_check_util.hpp +++ /dev/null @@ -1,266 +0,0 @@ -#ifndef CAFFE_TEST_GRADIENT_CHECK_UTIL_H_ -#define CAFFE_TEST_GRADIENT_CHECK_UTIL_H_ - -#include -#include - -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/net.hpp" - -namespace caffe { - -// The gradient checker adds a L2 normalization loss function on top of the -// top blobs, and checks the gradient. -template -class GradientChecker { - public: - // kink and kink_range specify an ignored nonsmooth region of the form - // kink - kink_range <= |feature value| <= kink + kink_range, - // which accounts for all nonsmoothness in use by caffe - GradientChecker(const Dtype stepsize, const Dtype threshold, - const unsigned int seed = 1701, const Dtype kink = 0., - const Dtype kink_range = -1) - : stepsize_(stepsize), threshold_(threshold), seed_(seed), - kink_(kink), kink_range_(kink_range) {} - // Checks the gradient of a layer, with provided bottom layers and top - // layers. 
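  // A minimal usage sketch from a hypothetical layer test (the stepsize and
  // threshold values are illustrative only, not taken from this patch):
  //
  //   GradientChecker<float> checker(1e-2, 1e-3);
  //   checker.CheckGradientExhaustive(&layer, bottom_vec, top_vec);
  //
  // The checker perturbs each input element by +/- stepsize, re-runs
  // Forward, and compares the finite-difference estimate against the
  // gradient produced by Backward.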
- // Note that after the gradient check, we do not guarantee that the data - // stored in the layer parameters and the blobs are unchanged. - void CheckGradient(Layer* layer, const vector*>& bottom, - const vector*>& top, int check_bottom = -1) { - layer->SetUp(bottom, top); - CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1); - } - void CheckGradientExhaustive(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom = -1); - - // CheckGradientEltwise can be used to test layers that perform element-wise - // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when - // i != j. - void CheckGradientEltwise(Layer* layer, - const vector*>& bottom, const vector*>& top); - - // Checks the gradient of a single output with respect to particular input - // blob(s). If check_bottom = i >= 0, check only the ith bottom Blob. - // If check_bottom == -1, check everything -- all bottom Blobs and all - // param Blobs. Otherwise (if check_bottom < -1), check only param Blobs. - void CheckGradientSingle(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom, int top_id, int top_data_id, bool element_wise = false); - - // Checks the gradient of a network. This network should not have any data - // layers or loss layers, since the function does not explicitly deal with - // such cases yet. All input blobs and parameter blobs are going to be - // checked, layer-by-layer to avoid numerical problems to accumulate. - void CheckGradientNet(const Net& net, - const vector*>& input); - - protected: - Dtype GetObjAndGradient(const Layer& layer, - const vector*>& top, int top_id = -1, int top_data_id = -1); - Dtype stepsize_; - Dtype threshold_; - unsigned int seed_; - Dtype kink_; - Dtype kink_range_; -}; - - -template -void GradientChecker::CheckGradientSingle(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom, int top_id, int top_data_id, bool element_wise) { - if (element_wise) { - CHECK_EQ(0, layer->blobs().size()); - CHECK_LE(0, top_id); - CHECK_LE(0, top_data_id); - const int top_count = top[top_id]->count(); - for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) { - CHECK_EQ(top_count, bottom[blob_id]->count()); - } - } - // First, figure out what blobs we need to check against, and zero init - // parameter blobs. - vector*> blobs_to_check; - vector propagate_down(bottom.size(), check_bottom == -1); - for (int i = 0; i < layer->blobs().size(); ++i) { - Blob* blob = layer->blobs()[i].get(); - caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); - blobs_to_check.push_back(blob); - } - if (check_bottom == -1) { - for (int i = 0; i < bottom.size(); ++i) { - blobs_to_check.push_back(bottom[i]); - } - } else if (check_bottom >= 0) { - CHECK_LT(check_bottom, bottom.size()); - blobs_to_check.push_back(bottom[check_bottom]); - propagate_down[check_bottom] = true; - } - CHECK_GT(blobs_to_check.size(), 0) << "No blobs to check."; - // Compute the gradient analytically using Backward - Caffe::set_random_seed(seed_); - // Ignore the loss from the layer (it's just the weighted sum of the losses - // from the top blobs, whose gradients we may want to test individually). 
- layer->Forward(bottom, top); - // Get additional loss from the objective - GetObjAndGradient(*layer, top, top_id, top_data_id); - layer->Backward(top, propagate_down, bottom); - // Store computed gradients for all checked blobs - vector > > - computed_gradient_blobs(blobs_to_check.size()); - for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { - Blob* current_blob = blobs_to_check[blob_id]; - computed_gradient_blobs[blob_id].reset(new Blob()); - computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob); - const int count = blobs_to_check[blob_id]->count(); - const Dtype* diff = blobs_to_check[blob_id]->cpu_diff(); - Dtype* computed_gradients = - computed_gradient_blobs[blob_id]->mutable_cpu_data(); - caffe_copy(count, diff, computed_gradients); - } - // Compute derivative of top w.r.t. each bottom and parameter input using - // finite differencing. - // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs."; - for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { - Blob* current_blob = blobs_to_check[blob_id]; - const Dtype* computed_gradients = - computed_gradient_blobs[blob_id]->cpu_data(); - // LOG(ERROR) << "Blob " << blob_id << ": checking " - // << current_blob->count() << " parameters."; - for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) { - // For an element-wise layer, we only need to do finite differencing to - // compute the derivative of top[top_id][top_data_id] w.r.t. - // bottom[blob_id][i] only for i == top_data_id. For any other - // i != top_data_id, we know the derivative is 0 by definition, and simply - // check that that's true. - Dtype estimated_gradient = 0; - Dtype positive_objective = 0; - Dtype negative_objective = 0; - if (!element_wise || (feat_id == top_data_id)) { - // Do finite differencing. - // Compute loss with stepsize_ added to input. - current_blob->mutable_cpu_data()[feat_id] += stepsize_; - Caffe::set_random_seed(seed_); - layer->Forward(bottom, top); - positive_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); - // Compute loss with stepsize_ subtracted from input. - current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2; - Caffe::set_random_seed(seed_); - layer->Forward(bottom, top); - negative_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); - // Recover original input value. - current_blob->mutable_cpu_data()[feat_id] += stepsize_; - estimated_gradient = (positive_objective - negative_objective) / - stepsize_ / 2.; - } - Dtype computed_gradient = computed_gradients[feat_id]; - Dtype feature = current_blob->cpu_data()[feat_id]; - // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " " - // << current_blob->cpu_diff()[feat_id]; - if (kink_ - kink_range_ > fabs(feature) - || fabs(feature) > kink_ + kink_range_) { - // We check relative accuracy, but for too small values, we threshold - // the scale factor by 1. 
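      // In isolation, the estimate and acceptance test implemented here are
      // (a sketch with illustrative names, not code from the patch):
      //
      //   estimated = (J(x + stepsize) - J(x - stepsize)) / (2 * stepsize);
      //   scale     = max(|computed|, |estimated|, 1);
      //   pass if |computed - estimated| <= threshold * scale;
      //
      // i.e. a central difference with O(stepsize^2) error, compared under a
      // relative tolerance that falls back to an absolute one near zero.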
- Dtype scale = std::max( - std::max(fabs(computed_gradient), fabs(estimated_gradient)), - Dtype(1.)); - EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale) - << "debug: (top_id, top_data_id, blob_id, feat_id)=" - << top_id << "," << top_data_id << "," << blob_id << "," << feat_id - << "; feat = " << feature - << "; objective+ = " << positive_objective - << "; objective- = " << negative_objective; - } - // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id]; - // LOG(ERROR) << "computed gradient: " << computed_gradient - // << " estimated_gradient: " << estimated_gradient; - } - } -} - -template -void GradientChecker::CheckGradientExhaustive(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom) { - layer->SetUp(bottom, top); - CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob."; - // LOG(ERROR) << "Exhaustive Mode."; - for (int i = 0; i < top.size(); ++i) { - // LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count(); - for (int j = 0; j < top[i]->count(); ++j) { - // LOG(ERROR) << "Exhaustive: blob " << i << " data " << j; - CheckGradientSingle(layer, bottom, top, check_bottom, i, j); - } - } -} - -template -void GradientChecker::CheckGradientEltwise(Layer* layer, - const vector*>& bottom, const vector*>& top) { - layer->SetUp(bottom, top); - CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob."; - const int check_bottom = -1; - const bool element_wise = true; - for (int i = 0; i < top.size(); ++i) { - for (int j = 0; j < top[i]->count(); ++j) { - CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise); - } - } -} - -template -void GradientChecker::CheckGradientNet( - const Net& net, const vector*>& input) { - const vector > >& layers = net.layers(); - vector*> >& bottom_vecs = net.bottom_vecs(); - vector*> >& top_vecs = net.top_vecs(); - for (int i = 0; i < layers.size(); ++i) { - net.Forward(input); - LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name(); - CheckGradientExhaustive(*(layers[i].get()), bottom_vecs[i], top_vecs[i]); - } -} - -template -Dtype GradientChecker::GetObjAndGradient(const Layer& layer, - const vector*>& top, int top_id, int top_data_id) { - Dtype loss = 0; - if (top_id < 0) { - // the loss will be half of the sum of squares of all outputs - for (int i = 0; i < top.size(); ++i) { - Blob* top_blob = top[i]; - const Dtype* top_blob_data = top_blob->cpu_data(); - Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); - int count = top_blob->count(); - for (int j = 0; j < count; ++j) { - loss += top_blob_data[j] * top_blob_data[j]; - } - // set the diff: simply the data. - caffe_copy(top_blob->count(), top_blob_data, top_blob_diff); - } - loss /= 2.; - } else { - // the loss will be the top_data_id-th element in the top_id-th blob. 
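    // A note on how the diffs are seeded: with L = (1/2) * sum_j y_j^2 over
    // all top blobs (the branch above), dL/dy_j = y_j, which is why the diff
    // is simply a copy of the data; in the single-element branch that
    // follows, L = loss_weight * y[top_data_id], so the corresponding diff
    // entry is set to loss_weight (2) and everything else to zero.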
- for (int i = 0; i < top.size(); ++i) { - Blob* top_blob = top[i]; - Dtype* top_blob_diff = top_blob->mutable_cpu_diff(); - caffe_set(top_blob->count(), Dtype(0), top_blob_diff); - } - const Dtype loss_weight = 2; - loss = top[top_id]->cpu_data()[top_data_id] * loss_weight; - top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight; - } - return loss; -} - -} // namespace caffe - -#endif // CAFFE_TEST_GRADIENT_CHECK_UTIL_H_ diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp deleted file mode 100644 index d635827..0000000 --- a/include/caffe/util/benchmark.hpp +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef CAFFE_UTIL_BENCHMARK_H_ -#define CAFFE_UTIL_BENCHMARK_H_ - -#include - -#include "caffe/util/device_alternate.hpp" - -namespace caffe { - -class Timer { - public: - Timer(); - virtual ~Timer(); - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float MicroSeconds(); - virtual float Seconds(); - - inline bool initted() { return initted_; } - inline bool running() { return running_; } - inline bool has_run_at_least_once() { return has_run_at_least_once_; } - - protected: - void Init(); - - bool initted_; - bool running_; - bool has_run_at_least_once_; -#ifndef CPU_ONLY - cudaEvent_t start_gpu_; - cudaEvent_t stop_gpu_; -#endif - boost::posix_time::ptime start_cpu_; - boost::posix_time::ptime stop_cpu_; - float elapsed_milliseconds_; - float elapsed_microseconds_; -}; - -class CPUTimer : public Timer { - public: - explicit CPUTimer(); - virtual ~CPUTimer() {} - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float MicroSeconds(); -}; - -} // namespace caffe - -#endif // CAFFE_UTIL_BENCHMARK_H_ diff --git a/include/caffe/util/blocking_queue.hpp b/include/caffe/util/blocking_queue.hpp deleted file mode 100644 index d3de2e5..0000000 --- a/include/caffe/util/blocking_queue.hpp +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef CAFFE_UTIL_BLOCKING_QUEUE_HPP_ -#define CAFFE_UTIL_BLOCKING_QUEUE_HPP_ - -#include -#include - -namespace caffe { - -template -class BlockingQueue { - public: - explicit BlockingQueue(); - - void push(const T& t); - - bool try_pop(T* t); - - // This logs a message if the threads needs to be blocked - // useful for detecting e.g. when data feeding is too slow - T pop(const string& log_on_wait = ""); - - bool try_peek(T* t); - - // Return element without removing it - T peek(); - - size_t size() const; - - protected: - /** - Move synchronization fields out instead of including boost/thread.hpp - to avoid a boost/NVCC issues (#1009, #1010) on OSX. Also fails on - Linux CUDA 7.0.18. 
- */ - class sync; - - std::queue queue_; - shared_ptr sync_; - -DISABLE_COPY_AND_ASSIGN(BlockingQueue); -}; - -} // namespace caffe - -#endif diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp deleted file mode 100644 index a7d8dbb..0000000 --- a/include/caffe/util/cudnn.hpp +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef CAFFE_UTIL_CUDNN_H_ -#define CAFFE_UTIL_CUDNN_H_ -#ifdef USE_CUDNN - -#include - -#include "caffe/common.hpp" -#include "caffe/proto/caffe.pb.h" - -#define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= (major * 1000 + minor * 100 + patch)) - -#define CUDNN_CHECK(condition) \ - do { \ - cudnnStatus_t status = condition; \ - CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " "\ - << cudnnGetErrorString(status); \ - } while (0) - -inline const char* cudnnGetErrorString(cudnnStatus_t status) { - switch (status) { - case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; - case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; - case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; - case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; - case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; - case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; - case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; - case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; - case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; - case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; - case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; - } - return "Unknown cudnn status"; -} - -namespace caffe { - -namespace cudnn { - -template class dataType; -template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_FLOAT; - static float oneval, zeroval; - static const void *one, *zero; -}; -template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; - static double oneval, zeroval; - static const void *one, *zero; -}; - -template -inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) { - CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); -} - -template -inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w, - int stride_n, int stride_c, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, - n, c, h, w, stride_n, stride_c, stride_h, stride_w)); -} - -template -inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w) { - const int stride_w = 1; - const int stride_h = w * stride_w; - const int stride_c = h * stride_h; - const int stride_n = c * stride_c; - setTensor4dDesc(desc, n, c, h, w, - stride_n, stride_c, stride_h, stride_w); -} - -template -inline void createFilterDesc(cudnnFilterDescriptor_t* desc, - int n, int c, int h, int w) { - CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); -#if CUDNN_VERSION_MIN(5, 0, 0) - CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, - CUDNN_TENSOR_NCHW, n, c, h, w)); -#else - CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(*desc, dataType::type, - CUDNN_TENSOR_NCHW, n, c, h, w)); -#endif -} - -template -inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { - CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); -} - -template -inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, - cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, - int pad_h, 
int pad_w, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, - pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); -} - -template -inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, - PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, - int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { - switch (poolmethod) { - case PoolingParameter_PoolMethod_MAX: - *mode = CUDNN_POOLING_MAX; - break; - case PoolingParameter_PoolMethod_AVE: - *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); -#if CUDNN_VERSION_MIN(5, 0, 0) - CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, - CUDNN_PROPAGATE_NAN, h, w, pad_h, pad_w, stride_h, stride_w)); -#else - CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(*pool_desc, *mode, - CUDNN_PROPAGATE_NAN, h, w, pad_h, pad_w, stride_h, stride_w)); -#endif -} - -template -inline void createActivationDescriptor(cudnnActivationDescriptor_t* activ_desc, - cudnnActivationMode_t mode) { - CUDNN_CHECK(cudnnCreateActivationDescriptor(activ_desc)); - CUDNN_CHECK(cudnnSetActivationDescriptor(*activ_desc, mode, - CUDNN_PROPAGATE_NAN, Dtype(0))); -} - -} // namespace cudnn - -} // namespace caffe - -#endif // USE_CUDNN -#endif // CAFFE_UTIL_CUDNN_H_ diff --git a/include/caffe/util/db.hpp b/include/caffe/util/db.hpp deleted file mode 100644 index 59ec3d3..0000000 --- a/include/caffe/util/db.hpp +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef CAFFE_UTIL_DB_HPP -#define CAFFE_UTIL_DB_HPP - -#include - -#include "caffe/common.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { namespace db { - -enum Mode { READ, WRITE, NEW }; - -class Cursor { - public: - Cursor() { } - virtual ~Cursor() { } - virtual void SeekToFirst() = 0; - virtual void Next() = 0; - virtual string key() = 0; - virtual string value() = 0; - virtual bool valid() = 0; - - DISABLE_COPY_AND_ASSIGN(Cursor); -}; - -class Transaction { - public: - Transaction() { } - virtual ~Transaction() { } - virtual void Put(const string& key, const string& value) = 0; - virtual void Commit() = 0; - - DISABLE_COPY_AND_ASSIGN(Transaction); -}; - -class DB { - public: - DB() { } - virtual ~DB() { } - virtual void Open(const string& source, Mode mode) = 0; - virtual void Close() = 0; - virtual Cursor* NewCursor() = 0; - virtual Transaction* NewTransaction() = 0; - - DISABLE_COPY_AND_ASSIGN(DB); -}; - -DB* GetDB(DataParameter::DB backend); -DB* GetDB(const string& backend); - -} // namespace db -} // namespace caffe - -#endif // CAFFE_UTIL_DB_HPP diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp deleted file mode 100644 index e9fa0d3..0000000 --- a/include/caffe/util/db_leveldb.hpp +++ /dev/null @@ -1,75 +0,0 @@ -#ifdef USE_LEVELDB -#ifndef CAFFE_UTIL_DB_LEVELDB_HPP -#define CAFFE_UTIL_DB_LEVELDB_HPP - -#include - -#include "leveldb/db.h" -#include "leveldb/write_batch.h" - -#include "caffe/util/db.hpp" - -namespace caffe { namespace db { - -class LevelDBCursor : public Cursor { - public: - explicit LevelDBCursor(leveldb::Iterator* iter) - : iter_(iter) { SeekToFirst(); } - ~LevelDBCursor() { delete iter_; } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void Next() { iter_->Next(); } - virtual string key() { return iter_->key().ToString(); } - virtual string value() { return iter_->value().ToString(); } - virtual bool valid() { return iter_->Valid(); } - - private: - 
leveldb::Iterator* iter_; -}; - -class LevelDBTransaction : public Transaction { - public: - explicit LevelDBTransaction(leveldb::DB* db) : db_(db) { CHECK_NOTNULL(db_); } - virtual void Put(const string& key, const string& value) { - batch_.Put(key, value); - } - virtual void Commit() { - leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_); - CHECK(status.ok()) << "Failed to write batch to leveldb " - << std::endl << status.ToString(); - } - - private: - leveldb::DB* db_; - leveldb::WriteBatch batch_; - - DISABLE_COPY_AND_ASSIGN(LevelDBTransaction); -}; - -class LevelDB : public DB { - public: - LevelDB() : db_(NULL) { } - virtual ~LevelDB() { Close(); } - virtual void Open(const string& source, Mode mode); - virtual void Close() { - if (db_ != NULL) { - delete db_; - db_ = NULL; - } - } - virtual LevelDBCursor* NewCursor() { - return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions())); - } - virtual LevelDBTransaction* NewTransaction() { - return new LevelDBTransaction(db_); - } - - private: - leveldb::DB* db_; -}; - - -} // namespace db -} // namespace caffe - -#endif // CAFFE_UTIL_DB_LEVELDB_HPP -#endif // USE_LEVELDB diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp deleted file mode 100644 index ee37032..0000000 --- a/include/caffe/util/db_lmdb.hpp +++ /dev/null @@ -1,96 +0,0 @@ -#ifdef USE_LMDB -#ifndef CAFFE_UTIL_DB_LMDB_HPP -#define CAFFE_UTIL_DB_LMDB_HPP - -#include -#include - -#include "lmdb.h" - -#include "caffe/util/db.hpp" - -namespace caffe { namespace db { - -inline void MDB_CHECK(int mdb_status) { - CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status); -} - -class LMDBCursor : public Cursor { - public: - explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) - : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { - SeekToFirst(); - } - virtual ~LMDBCursor() { - mdb_cursor_close(mdb_cursor_); - mdb_txn_abort(mdb_txn_); - } - virtual void SeekToFirst() { Seek(MDB_FIRST); } - virtual void Next() { Seek(MDB_NEXT); } - virtual string key() { - return string(static_cast(mdb_key_.mv_data), mdb_key_.mv_size); - } - virtual string value() { - return string(static_cast(mdb_value_.mv_data), - mdb_value_.mv_size); - } - virtual bool valid() { return valid_; } - - private: - void Seek(MDB_cursor_op op) { - int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); - if (mdb_status == MDB_NOTFOUND) { - valid_ = false; - } else { - MDB_CHECK(mdb_status); - valid_ = true; - } - } - - MDB_txn* mdb_txn_; - MDB_cursor* mdb_cursor_; - MDB_val mdb_key_, mdb_value_; - bool valid_; -}; - -class LMDBTransaction : public Transaction { - public: - explicit LMDBTransaction(MDB_env* mdb_env) - : mdb_env_(mdb_env) { } - virtual void Put(const string& key, const string& value); - virtual void Commit(); - - private: - MDB_env* mdb_env_; - vector keys, values; - - void DoubleMapSize(); - - DISABLE_COPY_AND_ASSIGN(LMDBTransaction); -}; - -class LMDB : public DB { - public: - LMDB() : mdb_env_(NULL) { } - virtual ~LMDB() { Close(); } - virtual void Open(const string& source, Mode mode); - virtual void Close() { - if (mdb_env_ != NULL) { - mdb_dbi_close(mdb_env_, mdb_dbi_); - mdb_env_close(mdb_env_); - mdb_env_ = NULL; - } - } - virtual LMDBCursor* NewCursor(); - virtual LMDBTransaction* NewTransaction(); - - private: - MDB_env* mdb_env_; - MDB_dbi mdb_dbi_; -}; - -} // namespace db -} // namespace caffe - -#endif // CAFFE_UTIL_DB_LMDB_HPP -#endif // USE_LMDB diff --git a/include/caffe/util/device_alternate.hpp 
b/include/caffe/util/device_alternate.hpp deleted file mode 100644 index e3fe4fe..0000000 --- a/include/caffe/util/device_alternate.hpp +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef CAFFE_UTIL_DEVICE_ALTERNATE_H_ -#define CAFFE_UTIL_DEVICE_ALTERNATE_H_ - -#ifdef CPU_ONLY // CPU-only Caffe. - -#include - -// Stub out GPU calls as unavailable. - -#define NO_GPU LOG(FATAL) << "Cannot use GPU in CPU-only Caffe: check mode." - -#define STUB_GPU(classname) \ -template \ -void classname::Forward_gpu(const vector*>& bottom, \ - const vector*>& top) { NO_GPU; } \ -template \ -void classname::Backward_gpu(const vector*>& top, \ - const vector& propagate_down, \ - const vector*>& bottom) { NO_GPU; } \ - -#define STUB_GPU_FORWARD(classname, funcname) \ -template \ -void classname::funcname##_##gpu(const vector*>& bottom, \ - const vector*>& top) { NO_GPU; } \ - -#define STUB_GPU_BACKWARD(classname, funcname) \ -template \ -void classname::funcname##_##gpu(const vector*>& top, \ - const vector& propagate_down, \ - const vector*>& bottom) { NO_GPU; } \ - -#else // Normal GPU + CPU Caffe. - -#include -#include -#include -#include -#include // cuda driver types -#ifdef USE_CUDNN // cuDNN acceleration library. -#include "caffe/util/cudnn.hpp" -#endif - -// -// CUDA macros -// - -// CUDA: various checks for different function calls. -#define CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ - } while (0) - -#define CUBLAS_CHECK(condition) \ - do { \ - cublasStatus_t status = condition; \ - CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ - << caffe::cublasGetErrorString(status); \ - } while (0) - -#define CURAND_CHECK(condition) \ - do { \ - curandStatus_t status = condition; \ - CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ - << caffe::curandGetErrorString(status); \ - } while (0) - -// CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -// CUDA: check for error after kernel execution and exit loudly if there is one. -#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) - -namespace caffe { - -// CUDA: library error reporting. -const char* cublasGetErrorString(cublasStatus_t error); -const char* curandGetErrorString(curandStatus_t error); - -// CUDA: use 512 threads per block -const int CAFFE_CUDA_NUM_THREADS = 512; - -// CUDA: number of blocks for threads. 
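// Worked example for the helper that follows: with CAFFE_CUDA_NUM_THREADS =
// 512, CAFFE_GET_BLOCKS(1000) = (1000 + 511) / 512 = 2 blocks -- a ceiling
// division so that blocks * threads >= N. The grid-stride loop in
// CUDA_KERNEL_LOOP above then lets each thread cover every N-th element, so
// all N elements are processed even if the grid is capped.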
-inline int CAFFE_GET_BLOCKS(const int N) { - return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS; -} - -} // namespace caffe - -#endif // CPU_ONLY - -#endif // CAFFE_UTIL_DEVICE_ALTERNATE_H_ diff --git a/include/caffe/util/gpu_util.cuh b/include/caffe/util/gpu_util.cuh deleted file mode 100644 index 994202f..0000000 --- a/include/caffe/util/gpu_util.cuh +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef CAFFE_UTIL_GPU_UTIL_H_ -#define CAFFE_UTIL_GPU_UTIL_H_ - -namespace caffe { - -template -inline __device__ Dtype caffe_gpu_atomic_add(const Dtype val, Dtype* address); - -template <> -inline __device__ -float caffe_gpu_atomic_add(const float val, float* address) { - return atomicAdd(address, val); -} - -// double atomicAdd implementation taken from: -// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#axzz3PVCpVsEG -template <> -inline __device__ -double caffe_gpu_atomic_add(const double val, double* address) { - unsigned long long int* address_as_ull = // NOLINT(runtime/int) - // NOLINT_NEXT_LINE(runtime/int) - reinterpret_cast(address); - unsigned long long int old = *address_as_ull; // NOLINT(runtime/int) - unsigned long long int assumed; // NOLINT(runtime/int) - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -} - -} // namespace caffe - -#endif // CAFFE_UTIL_GPU_UTIL_H_ diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 1a59988..2e53aa9 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -1,7 +1,6 @@ #ifndef CAFFE_UTIL_IO_H_ #define CAFFE_UTIL_IO_H_ -#include #include #include // NOLINT(readability/streams) #include @@ -10,7 +9,6 @@ #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" -#include "caffe/util/format.hpp" #ifndef CAFFE_TMP_DIR_RETRIES #define CAFFE_TMP_DIR_RETRIES 100 @@ -19,35 +17,6 @@ namespace caffe { using ::google::protobuf::Message; -using ::boost::filesystem::path; - -inline void MakeTempDir(string* temp_dirname) { - temp_dirname->clear(); - const path& model = - boost::filesystem::temp_directory_path()/"caffe_test.%%%%-%%%%"; - for ( int i = 0; i < CAFFE_TMP_DIR_RETRIES; i++ ) { - const path& dir = boost::filesystem::unique_path(model).string(); - bool done = boost::filesystem::create_directory(dir); - if ( done ) { - *temp_dirname = dir.string(); - return; - } - } - LOG(FATAL) << "Failed to create a temporary directory."; -} - -inline void MakeTempFilename(string* temp_filename) { - static path temp_files_subpath; - static uint64_t next_temp_file = 0; - temp_filename->clear(); - if ( temp_files_subpath.empty() ) { - string path_string=""; - MakeTempDir(&path_string); - temp_files_subpath = path_string; - } - *temp_filename = - (temp_files_subpath/caffe::format_int(next_temp_file++, 9)).string(); -} bool ReadProtoFromTextFile(const char* filename, Message* proto); @@ -90,63 +59,6 @@ inline void WriteProtoToBinaryFile( WriteProtoToBinaryFile(proto, filename.c_str()); } -bool ReadFileToDatum(const string& filename, const int label, Datum* datum); - -inline bool ReadFileToDatum(const string& filename, Datum* datum) { - return ReadFileToDatum(filename, -1, datum); -} - -bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum); - -inline bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, 
Datum* datum) { - return ReadImageToDatum(filename, label, height, width, is_color, - "", datum); -} - -inline bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, Datum* datum) { - return ReadImageToDatum(filename, label, height, width, true, datum); -} - -inline bool ReadImageToDatum(const string& filename, const int label, - const bool is_color, Datum* datum) { - return ReadImageToDatum(filename, label, 0, 0, is_color, datum); -} - -inline bool ReadImageToDatum(const string& filename, const int label, - Datum* datum) { - return ReadImageToDatum(filename, label, 0, 0, true, datum); -} - -inline bool ReadImageToDatum(const string& filename, const int label, - const std::string & encoding, Datum* datum) { - return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum); -} - -bool DecodeDatumNative(Datum* datum); -bool DecodeDatum(Datum* datum, bool is_color); - -#ifdef USE_OPENCV -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color); - -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width); - -cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color); - -cv::Mat ReadImageToCVMat(const string& filename); - -cv::Mat DecodeDatumToCVMatNative(const Datum& datum); -cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color); - -void CVMatToDatum(const cv::Mat& cv_img, Datum* datum); -#endif // USE_OPENCV - } // namespace caffe #endif // CAFFE_UTIL_IO_H_ diff --git a/include/caffe/util/logging.hpp b/include/caffe/util/logging.hpp new file mode 100644 index 0000000..bb60af4 --- /dev/null +++ b/include/caffe/util/logging.hpp @@ -0,0 +1,250 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file logging.h + * \brief defines logging macros of dmlc + * allows use of GLOG, fall back to internal + * implementation when disabled + */ +#ifndef DMLC_LOGGING_H_ +#define DMLC_LOGGING_H_ +#include +#include +#include +#include +#include + +namespace caffe { +/*! + * \brief exception class that will be thrown by + * default logger if DMLC_LOG_FATAL_THROW == 1 + */ +struct Error : public std::runtime_error { + /*! + * \brief constructor + * \param s the error message + */ + explicit Error(const std::string &s) : std::runtime_error(s) {} +}; +} // namespace caffe + +// use a light version of glog +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable : 4722) +#endif + +namespace caffe { +inline void InitLogging(const char* argv0) { + // DO NOTHING +} + +class LogCheckError { + public: + LogCheckError() : str(nullptr) {} + explicit LogCheckError(const std::string& str_) : str(new std::string(str_)) {} + ~LogCheckError() { if (str != nullptr) delete str; } + operator bool() {return str != nullptr; } + std::string* str; +}; + +#define DEFINE_CHECK_FUNC(name, op) \ + template \ + inline LogCheckError LogCheck##name(const X& x, const Y& y) { \ + if (x op y) return LogCheckError(); \ + std::ostringstream os; \ + os << " (" << x << " vs. " << y << ") "; /* CHECK_XX(x, y) requires x and y can be serialized to string. Use CHECK(x OP y) otherwise. 
NOLINT(*) */ \ + return LogCheckError(os.str()); \ + } \ + inline LogCheckError LogCheck##name(int x, int y) { \ + return LogCheck##name(x, y); \ + } + +#define CHECK_BINARY_OP(name, op, x, y) \ + if (caffe::LogCheckError _check_err = caffe::LogCheck##name(x, y)) \ + caffe::LogMessageFatal(__FILE__, __LINE__).stream() \ + << "Check failed: " << #x " " #op " " #y << *(_check_err.str) + +DEFINE_CHECK_FUNC(_LT, <) +DEFINE_CHECK_FUNC(_GT, >) +DEFINE_CHECK_FUNC(_LE, <=) +DEFINE_CHECK_FUNC(_GE, >=) +DEFINE_CHECK_FUNC(_EQ, ==) +DEFINE_CHECK_FUNC(_NE, !=) + +// Always-on checking +#define CHECK(x) \ + if (!(x)) \ + caffe::LogMessageFatal(__FILE__, __LINE__).stream() \ + << "Check failed: " #x << ' ' +#define CHECK_LT(x, y) CHECK_BINARY_OP(_LT, <, x, y) +#define CHECK_GT(x, y) CHECK_BINARY_OP(_GT, >, x, y) +#define CHECK_LE(x, y) CHECK_BINARY_OP(_LE, <=, x, y) +#define CHECK_GE(x, y) CHECK_BINARY_OP(_GE, >=, x, y) +#define CHECK_EQ(x, y) CHECK_BINARY_OP(_EQ, ==, x, y) +#define CHECK_NE(x, y) CHECK_BINARY_OP(_NE, !=, x, y) +#define CHECK_NOTNULL(x) \ + ((x) == NULL ? caffe::LogMessageFatal(__FILE__, __LINE__).stream() << "Check notnull: " #x << ' ', (x) : (x)) // NOLINT(*) +// Debug-only checking. +#ifdef NDEBUG +#define DCHECK(x) \ + while (false) CHECK(x) +#define DCHECK_LT(x, y) \ + while (false) CHECK((x) < (y)) +#define DCHECK_GT(x, y) \ + while (false) CHECK((x) > (y)) +#define DCHECK_LE(x, y) \ + while (false) CHECK((x) <= (y)) +#define DCHECK_GE(x, y) \ + while (false) CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) \ + while (false) CHECK((x) == (y)) +#define DCHECK_NE(x, y) \ + while (false) CHECK((x) != (y)) +#else +#define DCHECK(x) CHECK(x) +#define DCHECK_LT(x, y) CHECK((x) < (y)) +#define DCHECK_GT(x, y) CHECK((x) > (y)) +#define DCHECK_LE(x, y) CHECK((x) <= (y)) +#define DCHECK_GE(x, y) CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) CHECK((x) == (y)) +#define DCHECK_NE(x, y) CHECK((x) != (y)) +#endif // NDEBUG + +#define LOG_INFO caffe::LogMessage(__FILE__, __LINE__) + +#define LOG_ERROR LOG_INFO +#define LOG_WARNING LOG_INFO +#define LOG_FATAL caffe::LogMessageFatal(__FILE__, __LINE__) +#define LOG_QFATAL LOG_FATAL + +// Poor man version of VLOG +#define VLOG(x) LOG_INFO.stream() + +#define LOG(severity) LOG_##severity.stream() +#define LG LOG_INFO.stream() +#define LOG_IF(severity, condition) \ + !(condition) ? (void)0 : caffe::LogMessageVoidify() & LOG(severity) + +#ifdef NDEBUG +#define LOG_DFATAL LOG_ERROR +#define DFATAL ERROR +#define DLOG(severity) true ? (void)0 : caffe::LogMessageVoidify() & LOG(severity) +#define DLOG_IF(severity, condition) \ + (true || !(condition)) ? 
(void)0 : caffe::LogMessageVoidify() & LOG(severity) +#else +#define LOG_DFATAL LOG_FATAL +#define DFATAL FATAL +#define DLOG(severity) LOG(severity) +#define DLOG_IF(severity, condition) LOG_IF(severity, condition) +#endif + +// Poor man version of LOG_EVERY_N +#define LOG_EVERY_N(severity, n) LOG(severity) + +class DateLogger { + public: + DateLogger() { +#if defined(_MSC_VER) + _tzset(); +#endif + } + const char* HumanDate() { +#if defined(_MSC_VER) + _strtime_s(buffer_, sizeof(buffer_)); +#else + time_t time_value = time(NULL); + struct tm *pnow; +#if !defined(_WIN32) + struct tm now; + pnow = localtime_r(&time_value, &now); +#else + pnow = localtime(&time_value); // NOLINT(*) +#endif + snprintf(buffer_, sizeof(buffer_), "%02d:%02d:%02d", + pnow->tm_hour, pnow->tm_min, pnow->tm_sec); +#endif + return buffer_; + } + + private: + char buffer_[9]; +}; + +class LogMessage { + public: + LogMessage(const char* file, int line) + : +#ifdef __ANDROID__ + log_stream_(std::cout) +#else + log_stream_(std::cerr) +#endif + { + log_stream_ << "[" << pretty_date_.HumanDate() << "] " << file << ":" + << line << ": "; + } + ~LogMessage() { log_stream_ << '\n'; } + std::ostream& stream() { return log_stream_; } + + protected: + std::ostream& log_stream_; + + private: + DateLogger pretty_date_; + LogMessage(const LogMessage&); + void operator=(const LogMessage&); +}; + +// customized logger that can allow user to define where to log the message. +class CustomLogMessage { + public: + CustomLogMessage(const char* file, int line) { + log_stream_ << "[" << DateLogger().HumanDate() << "] " << file << ":" + << line << ": "; + } + ~CustomLogMessage() { + Log(log_stream_.str()); + } + std::ostream& stream() { return log_stream_; } + /*! + * \brief customized logging of the message. + * This function won't be implemented by libdmlc + * \param msg The message to be logged. + */ + static void Log(const std::string& msg); + + private: + std::ostringstream log_stream_; +}; + +class LogMessageFatal : public LogMessage { + public: + LogMessageFatal(const char* file, int line) : LogMessage(file, line) {} + ~LogMessageFatal() { + log_stream_ << "\n"; + abort(); + } + + private: + LogMessageFatal(const LogMessageFatal&); + void operator=(const LogMessageFatal&); +}; + +// This class is used to explicitly ignore values in the conditional +// logging macros. This avoids compiler warnings like "value computed +// is not used" and "statement has no effect". +class LogMessageVoidify { + public: + LogMessageVoidify() {} + // This has to be an operator with a precedence lower than << but + // higher than "?:". See its usage. 
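  // Concretely, in the LOG_IF macro above:
  //   !(condition) ? (void)0 : caffe::LogMessageVoidify() & LOG(severity)
  // operator& binds more loosely than <<, so the whole "LOG(severity) << ..."
  // chain is built first and only then swallowed by operator&, giving the
  // false branch type void to match the (void)0 in the true branch.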
+ void operator&(std::ostream&) {} +}; + +} // namespace caffe + +#endif // DMLC_LOGGING_H_ \ No newline at end of file diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 6f6d3fe..7b22ed7 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -4,10 +4,8 @@ #include #include // for std::fabs and std::signbit -#include "glog/logging.h" - #include "caffe/common.hpp" -#include "caffe/util/device_alternate.hpp" +#include "caffe/util/logging.hpp" #include "caffe/util/mkl_alternate.hpp" namespace caffe { @@ -142,132 +140,6 @@ DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); template void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); -#ifndef CPU_ONLY // GPU - -// Decaf gpu gemm provides an interface that is almost the same as the cpu -// gemm function - following the c convention and calling the fortran-order -// gpu code under the hood. -template -void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); - -template -void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); - -template -void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); - -template -void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); - -void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); - -template -void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); - -inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(caffe/alt_fn) -#else - NO_GPU; -#endif -} - -template -void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); - -template -void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); - -template -void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); - -template -void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); - -template -void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); - -template -void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); - -template -void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y); - -template -void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y); - -template -void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); - -template -void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); - -// caffe_gpu_rng_uniform with two arguments generates integers in the range -// [0, UINT_MAX]. -void caffe_gpu_rng_uniform(const int n, unsigned int* r); - -// caffe_gpu_rng_uniform with four arguments generates floats in the range -// (a, b] (strictly greater than a, less than or equal to b) due to the -// specification of curandGenerateUniform. With a = 0, b = 1, just calls -// curandGenerateUniform; with other limits will shift and scale the outputs -// appropriately after calling curandGenerateUniform. 
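// In sketch form (illustrative, consistent with the comment above rather
// than code from the patch): curandGenerateUniform yields u in (0, 1], so
// the general case maps it with r = a + (b - a) * u, which lands in (a, b]
// and preserves the half-open endpoint convention described above.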
-template -void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); - -template -void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); - -template -void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); - -template -void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); - -template -void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); - -#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ -template \ -__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ - CUDA_KERNEL_LOOP(index, n) { \ - operation; \ - } \ -} \ -template <> \ -void caffe_gpu_##name(const int n, const float* x, float* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} \ -template <> \ -void caffe_gpu_##name(const int n, const double* x, double* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} - -#endif // !CPU_ONLY - } // namespace caffe #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp index 8f1cf0d..1e52ec3 100644 --- a/include/caffe/util/rng.hpp +++ b/include/caffe/util/rng.hpp @@ -3,18 +3,17 @@ #include #include - -#include "boost/random/mersenne_twister.hpp" -#include "boost/random/uniform_int.hpp" +#include #include "caffe/common.hpp" namespace caffe { -typedef boost::mt19937 rng_t; +typedef std::mt19937 rng_t; inline rng_t* caffe_rng() { - return static_cast(Caffe::rng_stream().generator()); + static rng_t rng; + return static_cast(&rng); } // Fisher–Yates algorithm @@ -23,7 +22,7 @@ inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end, RandomGenerator* gen) { typedef typename std::iterator_traits::difference_type difference_type; - typedef typename boost::uniform_int dist_type; + typedef typename std::uniform_int dist_type; difference_type length = std::distance(begin, end); if (length <= 0) return; diff --git a/include/caffe/util/signal_handler.h b/include/caffe/util/signal_handler.h deleted file mode 100644 index fb84c65..0000000 --- a/include/caffe/util/signal_handler.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef INCLUDE_CAFFE_UTIL_SIGNAL_HANDLER_H_ -#define INCLUDE_CAFFE_UTIL_SIGNAL_HANDLER_H_ - -#include "caffe/proto/caffe.pb.h" -#include "caffe/solver.hpp" - -namespace caffe { - -class SignalHandler { - public: - // Contructor. Specify what action to take when a signal is received. 
- SignalHandler(SolverAction::Enum SIGINT_action, - SolverAction::Enum SIGHUP_action); - ~SignalHandler(); - ActionCallback GetActionFunction(); - private: - SolverAction::Enum CheckForSignals() const; - SolverAction::Enum SIGINT_action_; - SolverAction::Enum SIGHUP_action_; -}; - -} // namespace caffe - -#endif // INCLUDE_CAFFE_UTIL_SIGNAL_HANDLER_H_ diff --git a/mini-caffe.cmake b/mini-caffe.cmake index d9a7579..c1d5fc6 100644 --- a/mini-caffe.cmake +++ b/mini-caffe.cmake @@ -1,35 +1,33 @@ # mini-caffe.cmake -find_package(OpenCV REQUIRED) -set(BOOST_DIR $ENV{BOOST_DIR}) - include_directories(${CMAKE_CURRENT_LIST_DIR}/3rdparty/include ${CMAKE_CURRENT_LIST_DIR}/3rdparty/include/openblas ${CMAKE_CURRENT_LIST_DIR}/3rdparty/include/google - ${CMAKE_CURRENT_LIST_DIR}/3rdparty/include/gflags - ${CMAKE_CURRENT_LIST_DIR}/3rdparty/include/glog - ${CMAKE_CURRENT_LIST_DIR}/include - ${BOOST_DIR}) + ${CMAKE_CURRENT_LIST_DIR}/include) -link_directories(${BOOST_DIR}/stage/lib # for self compiled - ${BOOST_DIR}/lib32-msvc-12.0 # for VS2013 - ${CMAKE_CURRENT_LIST_DIR}/3rdparty/lib) +link_directories(${CMAKE_CURRENT_LIST_DIR}/3rdparty/lib) -set(LIBS debug gflagsd optimized gflags - debug gflags_nothreadsd optimized gflags_nothreads - debug libglogd optimized libglog - debug libprotobufd optimized libprotobuf +set(LIBS debug libprotobufd optimized libprotobuf libopenblas Shlwapi) -file(GLOB SRC ${CMAKE_CURRENT_LIST_DIR}/src/caffe/*.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/caffe/layers/*.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/caffe/util/*.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/caffe/proto/caffe.pb.cc - ${CMAKE_CURRENT_LIST_DIR}/include/caffe/*.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/caffe/util/*.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/caffe/layers/*.hpp - ${CMAKE_CURRENT_LIST_DIR}/include/caffe/proto/caffe.pb.h) +file(GLOB CAFFE_INCLUDE_LAYERS ${CMAKE_CURRENT_LIST_DIR}/include/caffe/layers/*.hpp) +file(GLOB CAFFE_INCLUDE_UTIL ${CMAKE_CURRENT_LIST_DIR}/include/caffe/util/*.hpp) +file(GLOB CAFFE_INCLUDE_OTHER ${CMAKE_CURRENT_LIST_DIR}/include/caffe/*.hpp + ${CMAKE_CURRENT_LIST_DIR}/include/caffe/proto/caffe.pb.h) +file(GLOB CAFFE_SOURCE_LAYERS ${CMAKE_CURRENT_LIST_DIR}/src/caffe/layers/*.cpp) +file(GLOB CAFFE_SOURCE_UTIL ${CMAKE_CURRENT_LIST_DIR}/src/caffe/util/*.cpp) +file(GLOB CAFFE_SOURCE_OTHER ${CMAKE_CURRENT_LIST_DIR}/src/caffe/*.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/caffe/proto/caffe.pb.cc) + +source_group(include FILES ${CAFFE_INCLUDE_OTHER}) +source_group(include\\layers FILES ${CAFFE_INCLUDE_LAYERS}) +source_group(include\\util FILES ${CAFFE_INCLUDE_UTIL}) +source_group(src FILES ${CAFFE_SOURCE_OTHER}) +source_group(src\\layers FILES ${CAFFE_SOURCE_LAYERS}) +source_group(src\\util FILES ${CAFFE_SOURCE_UTIL}) + +set(SRC ${CAFFE_INCLUDE_LAYERS} ${CAFFE_INCLUDE_UTIL} ${CAFFE_INCLUDE_OTHER} + ${CAFFE_SOURCE_LAYERS} ${CAFFE_SOURCE_UTIL} ${CAFFE_SOURCE_OTHER}) -add_definitions(-DCPU_ONLY -DUSE_OPENCV) add_library(libcaffe STATIC ${SRC}) -target_link_libraries(libcaffe ${LIBS} ${OpenCV_LIBS}) +target_link_libraries(libcaffe ${LIBS}) diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt deleted file mode 100644 index 8a80c94..0000000 --- a/src/caffe/CMakeLists.txt +++ /dev/null @@ -1,40 +0,0 @@ -# generate protobuf sources -file(GLOB proto_files proto/*.proto) -caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files}) - -# include python files either to force generation -add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python}) -set(Caffe_LINKER_LIBS 
proto ${Caffe_LINKER_LIBS}) # note, crucial to prepend! -caffe_default_properties(proto) - -# --[ Caffe library - -# creates 'test_srcs', 'srcs', 'test_cuda', 'cuda' lists -caffe_pickup_caffe_sources(${PROJECT_SOURCE_DIR}) - -if(HAVE_CUDA) - caffe_cuda_compile(cuda_objs ${cuda}) - list(APPEND srcs ${cuda_objs} ${cuda}) -endif() - -add_library(caffe ${srcs}) -target_link_libraries(caffe proto ${Caffe_LINKER_LIBS}) -caffe_default_properties(caffe) -set_target_properties(caffe PROPERTIES - VERSION ${CAFFE_TARGET_VERSION} - SOVERSION ${CAFFE_TARGET_SOVERSION} - ) - -# ---[ Tests - add_subdirectory(test) - -# ---[ Install -install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include) -install(FILES ${proto_hdrs} DESTINATION include/caffe/proto) -install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) - -file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) -list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) -install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto) - - diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 4a34e4c..3c82ce7 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -164,14 +164,7 @@ void Blob::Update() { break; case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - // perform computation on GPU - caffe_gpu_axpy(count_, Dtype(-1), - static_cast(diff_->gpu_data()), - static_cast(data_->mutable_gpu_data())); -#else NO_GPU; -#endif break; default: LOG(FATAL) << "Syncedmem not initialized."; @@ -196,15 +189,7 @@ Dtype Blob::asum_data() const { return caffe_cpu_asum(count_, cpu_data()); case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - { - Dtype asum; - caffe_gpu_asum(count_, gpu_data(), &asum); - return asum; - } -#else NO_GPU; -#endif case SyncedMemory::UNINITIALIZED: return 0; default: @@ -231,15 +216,7 @@ Dtype Blob::asum_diff() const { return caffe_cpu_asum(count_, cpu_diff()); case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - { - Dtype asum; - caffe_gpu_asum(count_, gpu_diff(), &asum); - return asum; - } -#else NO_GPU; -#endif case SyncedMemory::UNINITIALIZED: return 0; default: @@ -270,12 +247,7 @@ Dtype Blob::sumsq_data() const { break; case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - data = gpu_data(); - caffe_gpu_dot(count_, data, data, &sumsq); -#else NO_GPU; -#endif break; case SyncedMemory::UNINITIALIZED: return 0; @@ -307,13 +279,7 @@ Dtype Blob::sumsq_diff() const { break; case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - diff = gpu_diff(); - caffe_gpu_dot(count_, diff, diff, &sumsq); - break; -#else NO_GPU; -#endif case SyncedMemory::UNINITIALIZED: return 0; default: @@ -341,13 +307,7 @@ void Blob::scale_data(Dtype scale_factor) { return; case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - data = mutable_gpu_data(); - caffe_gpu_scal(count_, scale_factor, data); - return; -#else NO_GPU; -#endif case SyncedMemory::UNINITIALIZED: return; default: @@ -374,13 +334,7 @@ void Blob::scale_diff(Dtype scale_factor) { return; case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: -#ifndef CPU_ONLY - diff = mutable_gpu_diff(); - caffe_gpu_scal(count_, scale_factor, diff); - return; -#else NO_GPU; -#endif case SyncedMemory::UNINITIALIZED: return; default: @@ -420,27 +374,12 @@ void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { LOG(FATAL) << "Trying to copy blobs of different sizes."; } } - switch (Caffe::mode()) { - case Caffe::GPU: - if (copy_diff) 
{ - caffe_copy(count_, source.gpu_diff(), - static_cast(diff_->mutable_gpu_data())); - } else { - caffe_copy(count_, source.gpu_data(), - static_cast(data_->mutable_gpu_data())); - } - break; - case Caffe::CPU: - if (copy_diff) { - caffe_copy(count_, source.cpu_diff(), - static_cast(diff_->mutable_cpu_data())); - } else { - caffe_copy(count_, source.cpu_data(), - static_cast(data_->mutable_cpu_data())); - } - break; - default: - LOG(FATAL) << "Unknown caffe mode."; + if (copy_diff) { + caffe_copy(count_, source.cpu_diff(), + static_cast(diff_->mutable_cpu_data())); + } else { + caffe_copy(count_, source.cpu_data(), + static_cast(data_->mutable_cpu_data())); } } diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp deleted file mode 100644 index e365930..0000000 --- a/src/caffe/common.cpp +++ /dev/null @@ -1,326 +0,0 @@ -#include -#include -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/util/rng.hpp" - -#include - -namespace caffe { - -// Make sure each thread can have different values. -static boost::thread_specific_ptr thread_instance_; - -Caffe& Caffe::Get() { - if (!thread_instance_.get()) { - thread_instance_.reset(new Caffe()); - } - return *(thread_instance_.get()); -} - -// random seeding -int64_t cluster_seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - - LOG(INFO) << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) - fclose(f); - - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; -} - - -void GlobalInit(int* pargc, char*** pargv) { - // Google flags. - ::gflags::ParseCommandLineFlags(pargc, pargv, true); - // Google logging. - ::google::InitGoogleLogging(*(pargv)[0]); - //// Provide a backtrace on segfault. - //::google::InstallFailureSignalHandler(); -} - -#ifdef CPU_ONLY // CPU-only Caffe. - -Caffe::Caffe() - : random_generator_(), mode_(Caffe::CPU), - solver_count_(1), root_solver_(true) { } - -Caffe::~Caffe() { } - -void Caffe::set_random_seed(const unsigned int seed) { - // RNG seed - Get().random_generator_.reset(new RNG(seed)); -} - -void Caffe::SetDevice(const int device_id) { - NO_GPU; -} - -void Caffe::DeviceQuery() { - NO_GPU; -} - -bool Caffe::CheckDevice(const int device_id) { - NO_GPU; - return false; -} - -int Caffe::FindDevice(const int start_id) { - NO_GPU; - return -1; -} - -class Caffe::RNG::Generator { - public: - Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} - explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} - caffe::rng_t* rng() { return rng_.get(); } - private: - shared_ptr rng_; -}; - -Caffe::RNG::RNG() : generator_(new Generator()) { } - -Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { } - -Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { - generator_ = other.generator_; - return *this; -} - -void* Caffe::RNG::generator() { - return static_cast(generator_->rng()); -} - -#else // Normal GPU + CPU Caffe. - -Caffe::Caffe() - : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(), - mode_(Caffe::CPU), solver_count_(1), root_solver_(true) { - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - LOG(ERROR) << "Cannot create Cublas handle. 
Cublas won't be available."; - } - // Try to create a curand handler. - if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) - != CURAND_STATUS_SUCCESS || - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) - != CURAND_STATUS_SUCCESS) { - LOG(ERROR) << "Cannot create Curand generator. Curand won't be available."; - } -} - -Caffe::~Caffe() { - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } -} - -void Caffe::set_random_seed(const unsigned int seed) { - // Curand seed - static bool g_curand_availability_logged = false; - if (Get().curand_generator_) { - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator(), - seed)); - CURAND_CHECK(curandSetGeneratorOffset(curand_generator(), 0)); - } else { - if (!g_curand_availability_logged) { - LOG(ERROR) << - "Curand not available. Skipping setting the curand seed."; - g_curand_availability_logged = true; - } - } - // RNG seed - Get().random_generator_.reset(new RNG(seed)); -} - -void Caffe::SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) { - return; - } - // The call to cudaSetDevice must come before any calls to Get, which - // may perform initialization using the GPU. - CUDA_CHECK(cudaSetDevice(device_id)); - if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); - if (Get().curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); - } - CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, - CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, - cluster_seedgen())); -} - -void Caffe::DeviceQuery() { - cudaDeviceProp prop; - int device; - if (cudaSuccess != cudaGetDevice(&device)) { - printf("No cuda device present.\n"); - return; - } - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - LOG(INFO) << "Device id: " << device; - LOG(INFO) << "Major revision number: " << prop.major; - LOG(INFO) << "Minor revision number: " << prop.minor; - LOG(INFO) << "Name: " << prop.name; - LOG(INFO) << "Total global memory: " << prop.totalGlobalMem; - LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock; - LOG(INFO) << "Total registers per block: " << prop.regsPerBlock; - LOG(INFO) << "Warp size: " << prop.warpSize; - LOG(INFO) << "Maximum memory pitch: " << prop.memPitch; - LOG(INFO) << "Maximum threads per block: " << prop.maxThreadsPerBlock; - LOG(INFO) << "Maximum dimension of block: " - << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " - << prop.maxThreadsDim[2]; - LOG(INFO) << "Maximum dimension of grid: " - << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " - << prop.maxGridSize[2]; - LOG(INFO) << "Clock rate: " << prop.clockRate; - LOG(INFO) << "Total constant memory: " << prop.totalConstMem; - LOG(INFO) << "Texture alignment: " << prop.textureAlignment; - LOG(INFO) << "Concurrent copy and execution: " - << (prop.deviceOverlap ? "Yes" : "No"); - LOG(INFO) << "Number of multiprocessors: " << prop.multiProcessorCount; - LOG(INFO) << "Kernel execution timeout: " - << (prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - return; -} - -bool Caffe::CheckDevice(const int device_id) { - // This function checks the availability of GPU #device_id. - // It attempts to create a context on the device by calling cudaFree(0). 
- // cudaSetDevice() alone is not sufficient to check the availability. - // It lazily records device_id, however, does not initialize a - // context. So it does not know if the host thread has the permission to use - // the device or not. - // - // In a shared environment where the devices are set to EXCLUSIVE_PROCESS - // or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess - // even if the device is exclusively occupied by another process or thread. - // Cuda operations that initialize the context are needed to check - // the permission. cudaFree(0) is one of those with no side effect, - // except the context initialization. - bool r = ((cudaSuccess == cudaSetDevice(device_id)) && - (cudaSuccess == cudaFree(0))); - // reset any error that may have occurred. - cudaGetLastError(); - return r; -} - -int Caffe::FindDevice(const int start_id) { - // This function finds the first available device by checking devices with - // ordinal from start_id to the highest available value. In the - // EXCLUSIVE_PROCESS or EXCLUSIVE_THREAD mode, if it succeeds, it also - // claims the device due to the initialization of the context. - int count = 0; - CUDA_CHECK(cudaGetDeviceCount(&count)); - for (int i = start_id; i < count; i++) { - if (CheckDevice(i)) return i; - } - return -1; -} - -class Caffe::RNG::Generator { - public: - Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} - explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} - caffe::rng_t* rng() { return rng_.get(); } - private: - shared_ptr rng_; -}; - -Caffe::RNG::RNG() : generator_(new Generator()) { } - -Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { } - -Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { - generator_.reset(other.generator_.get()); - return *this; -} - -void* Caffe::RNG::generator() { - return static_cast(generator_->rng()); -} - -const char* cublasGetErrorString(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; -#if CUDA_VERSION >= 6000 - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; -#endif -#if CUDA_VERSION >= 6050 - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; -#endif - } - return "Unknown cublas status"; -} - -const char* curandGetErrorString(curandStatus_t error) { - switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - 
case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - } - return "Unknown curand status"; -} - -#endif // CPU_ONLY - -} // namespace caffe diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp deleted file mode 100644 index 9f019bb..0000000 --- a/src/caffe/data_reader.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/data_reader.hpp" -#include "caffe/layers/data_layer.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -using boost::weak_ptr; - -map > DataReader::bodies_; -static boost::mutex bodies_mutex_; - -DataReader::DataReader(const LayerParameter& param) - : queue_pair_(new QueuePair( // - param.data_param().prefetch() * param.data_param().batch_size())) { - // Get or create a body - boost::mutex::scoped_lock lock(bodies_mutex_); - string key = source_key(param); - weak_ptr& weak = bodies_[key]; - body_ = weak.lock(); - if (!body_) { - body_.reset(new Body(param)); - bodies_[key] = weak_ptr(body_); - } - body_->new_queue_pairs_.push(queue_pair_); -} - -DataReader::~DataReader() { - string key = source_key(body_->param_); - body_.reset(); - boost::mutex::scoped_lock lock(bodies_mutex_); - if (bodies_[key].expired()) { - bodies_.erase(key); - } -} - -// - -DataReader::QueuePair::QueuePair(int size) { - // Initialize the free queue with requested number of datums - for (int i = 0; i < size; ++i) { - free_.push(new Datum()); - } -} - -DataReader::QueuePair::~QueuePair() { - Datum* datum; - while (free_.try_pop(&datum)) { - delete datum; - } - while (full_.try_pop(&datum)) { - delete datum; - } -} - -// - -DataReader::Body::Body(const LayerParameter& param) - : param_(param), - new_queue_pairs_() { - StartInternalThread(); -} - -DataReader::Body::~Body() { - StopInternalThread(); -} - -void DataReader::Body::InternalThreadEntry() { - shared_ptr db(db::GetDB(param_.data_param().backend())); - db->Open(param_.data_param().source(), db::READ); - shared_ptr cursor(db->NewCursor()); - vector > qps; - try { - int solver_count = param_.phase() == TRAIN ? Caffe::solver_count() : 1; - - // To ensure deterministic runs, only start running once all solvers - // are ready. But solvers need to peek on one item during initialization, - // so read one item, then wait for the next solver. - for (int i = 0; i < solver_count; ++i) { - shared_ptr qp(new_queue_pairs_.pop()); - read_one(cursor.get(), qp.get()); - qps.push_back(qp); - } - // Main loop - while (!must_stop()) { - for (int i = 0; i < solver_count; ++i) { - read_one(cursor.get(), qps[i].get()); - } - // Check no additional readers have been created. This can happen if - // more than one net is trained at a time per process, whether single - // or multi solver. It might also happen if two data layers have same - // name and same source. - CHECK_EQ(new_queue_pairs_.size(), 0); - } - } catch (boost::thread_interrupted&) { - // Interrupted exception is expected on shutdown - } -} - -void DataReader::Body::read_one(db::Cursor* cursor, QueuePair* qp) { - Datum* datum = qp->free_.pop(); - // TODO deserialize in-place instead of copy? 
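// A rough picture of the recycling scheme read_one() takes part in:
// QueuePair::free_ starts out pre-filled with prefetch * batch_size empty
// Datum objects; read_one() pops one, overwrites it with the next database
// record, and pushes it onto full_. The consuming data layer pops it from
// full_, copies it into its prefetch buffer, and returns the same Datum to
// free_, so the steady state needs no further allocation.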
- datum->ParseFromString(cursor->value()); - qp->full_.push(datum); - - // go to the next iter - cursor->Next(); - if (!cursor->valid()) { - DLOG(INFO) << "Restarting data prefetching from start."; - cursor->SeekToFirst(); - } -} - -} // namespace caffe diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp deleted file mode 100644 index 7189d67..0000000 --- a/src/caffe/data_transformer.cpp +++ /dev/null @@ -1,545 +0,0 @@ -#ifdef USE_OPENCV -#include -#endif // USE_OPENCV - -#include -#include - -#include "caffe/data_transformer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/util/rng.hpp" - -namespace caffe { - -template -DataTransformer::DataTransformer(const TransformationParameter& param, - Phase phase) - : param_(param), phase_(phase) { - // check if we want to use mean_file - if (param_.has_mean_file()) { - CHECK_EQ(param_.mean_value_size(), 0) << - "Cannot specify mean_file and mean_value at the same time"; - const string& mean_file = param.mean_file(); - if (Caffe::root_solver()) { - LOG(INFO) << "Loading mean file from: " << mean_file; - } - BlobProto blob_proto; - ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); - data_mean_.FromProto(blob_proto); - } - // check if we want to use mean_value - if (param_.mean_value_size() > 0) { - CHECK(param_.has_mean_file() == false) << - "Cannot specify mean_file and mean_value at the same time"; - for (int c = 0; c < param_.mean_value_size(); ++c) { - mean_values_.push_back(param_.mean_value(c)); - } - } -} - -template -void DataTransformer::Transform(const Datum& datum, - Dtype* transformed_data) { - const string& data = datum.data(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - - const int crop_size = param_.crop_size(); - const Dtype scale = param_.scale(); - const bool do_mirror = param_.mirror() && Rand(2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_uint8 = data.size() > 0; - const bool has_mean_values = mean_values_.size() > 0; - - CHECK_GT(datum_channels, 0); - CHECK_GE(datum_height, crop_size); - CHECK_GE(datum_width, crop_size); - - Dtype* mean = NULL; - if (has_mean_file) { - CHECK_EQ(datum_channels, data_mean_.channels()); - CHECK_EQ(datum_height, data_mean_.height()); - CHECK_EQ(datum_width, data_mean_.width()); - mean = data_mean_.mutable_cpu_data(); - } - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) << - "Specify either 1 mean_value or as many as channels: " << datum_channels; - if (datum_channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < datum_channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - } - - int height = datum_height; - int width = datum_width; - - int h_off = 0; - int w_off = 0; - if (crop_size) { - height = crop_size; - width = crop_size; - // We only do random crop when we do training. 
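// A concrete example of the offsets computed below: with a 256x256 datum
// and crop_size == 227, training draws h_off and w_off uniformly from
// [0, 256 - 227] = [0, 29] via Rand(30), while testing uses the fixed
// center crop h_off = w_off = (256 - 227) / 2 = 14.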
- if (phase_ == TRAIN) { - h_off = Rand(datum_height - crop_size + 1); - w_off = Rand(datum_width - crop_size + 1); - } else { - h_off = (datum_height - crop_size) / 2; - w_off = (datum_width - crop_size) / 2; - } - } - - Dtype datum_element; - int top_index, data_index; - for (int c = 0; c < datum_channels; ++c) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - data_index = (c * datum_height + h_off + h) * datum_width + w_off + w; - if (do_mirror) { - top_index = (c * height + h) * width + (width - 1 - w); - } else { - top_index = (c * height + h) * width + w; - } - if (has_uint8) { - datum_element = - static_cast(static_cast(data[data_index])); - } else { - datum_element = datum.float_data(data_index); - } - if (has_mean_file) { - transformed_data[top_index] = - (datum_element - mean[data_index]) * scale; - } else { - if (has_mean_values) { - transformed_data[top_index] = - (datum_element - mean_values_[c]) * scale; - } else { - transformed_data[top_index] = datum_element * scale; - } - } - } - } - } -} - - -template -void DataTransformer::Transform(const Datum& datum, - Blob* transformed_blob) { - // If datum is encoded, decoded and transform the cv::image. - if (datum.encoded()) { -#ifdef USE_OPENCV - CHECK(!(param_.force_color() && param_.force_gray())) - << "cannot set both force_color and force_gray"; - cv::Mat cv_img; - if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. - cv_img = DecodeDatumToCVMat(datum, param_.force_color()); - } else { - cv_img = DecodeDatumToCVMatNative(datum); - } - // Transform the cv::image into blob. - return Transform(cv_img, transformed_blob); -#else - LOG(FATAL) << "Encoded datum requires OpenCV; compile with USE_OPENCV."; -#endif // USE_OPENCV - } else { - if (param_.force_color() || param_.force_gray()) { - LOG(ERROR) << "force_color and force_gray only for encoded datum"; - } - } - - const int crop_size = param_.crop_size(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - - // Check dimensions. 
- const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int num = transformed_blob->num(); - - CHECK_EQ(channels, datum_channels); - CHECK_LE(height, datum_height); - CHECK_LE(width, datum_width); - CHECK_GE(num, 1); - - if (crop_size) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - } else { - CHECK_EQ(datum_height, height); - CHECK_EQ(datum_width, width); - } - - Dtype* transformed_data = transformed_blob->mutable_cpu_data(); - Transform(datum, transformed_data); -} - -template -void DataTransformer::Transform(const vector & datum_vector, - Blob* transformed_blob) { - const int datum_num = datum_vector.size(); - const int num = transformed_blob->num(); - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - - CHECK_GT(datum_num, 0) << "There is no datum to add"; - CHECK_LE(datum_num, num) << - "The size of datum_vector must be no greater than transformed_blob->num()"; - Blob uni_blob(1, channels, height, width); - for (int item_id = 0; item_id < datum_num; ++item_id) { - int offset = transformed_blob->offset(item_id); - uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); - Transform(datum_vector[item_id], &uni_blob); - } -} - -#ifdef USE_OPENCV -template -void DataTransformer::Transform(const vector & mat_vector, - Blob* transformed_blob) { - const int mat_num = mat_vector.size(); - const int num = transformed_blob->num(); - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - - CHECK_GT(mat_num, 0) << "There is no MAT to add"; - CHECK_EQ(mat_num, num) << - "The size of mat_vector must be equals to transformed_blob->num()"; - Blob uni_blob(1, channels, height, width); - for (int item_id = 0; item_id < mat_num; ++item_id) { - int offset = transformed_blob->offset(item_id); - uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); - Transform(mat_vector[item_id], &uni_blob); - } -} - -template -void DataTransformer::Transform(const cv::Mat& cv_img, - Blob* transformed_blob) { - const int crop_size = param_.crop_size(); - const int img_channels = cv_img.channels(); - const int img_height = cv_img.rows; - const int img_width = cv_img.cols; - - // Check dimensions. 
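// The per-pixel loop further down converts OpenCV's interleaved HWC layout
// (each row pointer walks B,G,R,B,G,R,... across the columns) into the
// blob's planar CHW layout via top_index = (c * height + h) * width + w;
// when do_mirror is set the write column flips to (width - 1 - w), so with
// width == 227 the pixel read at w == 0 is written to column 226.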
- const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int num = transformed_blob->num(); - - CHECK_EQ(channels, img_channels); - CHECK_LE(height, img_height); - CHECK_LE(width, img_width); - CHECK_GE(num, 1); - - CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; - - const Dtype scale = param_.scale(); - const bool do_mirror = param_.mirror() && Rand(2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_mean_values = mean_values_.size() > 0; - - CHECK_GT(img_channels, 0); - CHECK_GE(img_height, crop_size); - CHECK_GE(img_width, crop_size); - - Dtype* mean = NULL; - if (has_mean_file) { - CHECK_EQ(img_channels, data_mean_.channels()); - CHECK_EQ(img_height, data_mean_.height()); - CHECK_EQ(img_width, data_mean_.width()); - mean = data_mean_.mutable_cpu_data(); - } - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) << - "Specify either 1 mean_value or as many as channels: " << img_channels; - if (img_channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < img_channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - } - - int h_off = 0; - int w_off = 0; - cv::Mat cv_cropped_img = cv_img; - if (crop_size) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - // We only do random crop when we do training. - if (phase_ == TRAIN) { - h_off = Rand(img_height - crop_size + 1); - w_off = Rand(img_width - crop_size + 1); - } else { - h_off = (img_height - crop_size) / 2; - w_off = (img_width - crop_size) / 2; - } - cv::Rect roi(w_off, h_off, crop_size, crop_size); - cv_cropped_img = cv_img(roi); - } else { - CHECK_EQ(img_height, height); - CHECK_EQ(img_width, width); - } - - CHECK(cv_cropped_img.data); - - Dtype* transformed_data = transformed_blob->mutable_cpu_data(); - int top_index; - for (int h = 0; h < height; ++h) { - const uchar* ptr = cv_cropped_img.ptr(h); - int img_index = 0; - for (int w = 0; w < width; ++w) { - for (int c = 0; c < img_channels; ++c) { - if (do_mirror) { - top_index = (c * height + h) * width + (width - 1 - w); - } else { - top_index = (c * height + h) * width + w; - } - // int top_index = (c * height + h) * width + w; - Dtype pixel = static_cast(ptr[img_index++]); - if (has_mean_file) { - int mean_index = (c * img_height + h_off + h) * img_width + w_off + w; - transformed_data[top_index] = - (pixel - mean[mean_index]) * scale; - } else { - if (has_mean_values) { - transformed_data[top_index] = - (pixel - mean_values_[c]) * scale; - } else { - transformed_data[top_index] = pixel * scale; - } - } - } - } - } -} -#endif // USE_OPENCV - -template -void DataTransformer::Transform(Blob* input_blob, - Blob* transformed_blob) { - const int crop_size = param_.crop_size(); - const int input_num = input_blob->num(); - const int input_channels = input_blob->channels(); - const int input_height = input_blob->height(); - const int input_width = input_blob->width(); - - if (transformed_blob->count() == 0) { - // Initialize transformed_blob with the right shape. 
- if (crop_size) { - transformed_blob->Reshape(input_num, input_channels, - crop_size, crop_size); - } else { - transformed_blob->Reshape(input_num, input_channels, - input_height, input_width); - } - } - - const int num = transformed_blob->num(); - const int channels = transformed_blob->channels(); - const int height = transformed_blob->height(); - const int width = transformed_blob->width(); - const int size = transformed_blob->count(); - - CHECK_LE(input_num, num); - CHECK_EQ(input_channels, channels); - CHECK_GE(input_height, height); - CHECK_GE(input_width, width); - - - const Dtype scale = param_.scale(); - const bool do_mirror = param_.mirror() && Rand(2); - const bool has_mean_file = param_.has_mean_file(); - const bool has_mean_values = mean_values_.size() > 0; - - int h_off = 0; - int w_off = 0; - if (crop_size) { - CHECK_EQ(crop_size, height); - CHECK_EQ(crop_size, width); - // We only do random crop when we do training. - if (phase_ == TRAIN) { - h_off = Rand(input_height - crop_size + 1); - w_off = Rand(input_width - crop_size + 1); - } else { - h_off = (input_height - crop_size) / 2; - w_off = (input_width - crop_size) / 2; - } - } else { - CHECK_EQ(input_height, height); - CHECK_EQ(input_width, width); - } - - Dtype* input_data = input_blob->mutable_cpu_data(); - if (has_mean_file) { - CHECK_EQ(input_channels, data_mean_.channels()); - CHECK_EQ(input_height, data_mean_.height()); - CHECK_EQ(input_width, data_mean_.width()); - for (int n = 0; n < input_num; ++n) { - int offset = input_blob->offset(n); - caffe_sub(data_mean_.count(), input_data + offset, - data_mean_.cpu_data(), input_data + offset); - } - } - - if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) << - "Specify either 1 mean_value or as many as channels: " << input_channels; - if (mean_values_.size() == 1) { - caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data); - } else { - for (int n = 0; n < input_num; ++n) { - for (int c = 0; c < input_channels; ++c) { - int offset = input_blob->offset(n, c); - caffe_add_scalar(input_height * input_width, -(mean_values_[c]), - input_data + offset); - } - } - } - } - - Dtype* transformed_data = transformed_blob->mutable_cpu_data(); - - for (int n = 0; n < input_num; ++n) { - int top_index_n = n * channels; - int data_index_n = n * channels; - for (int c = 0; c < channels; ++c) { - int top_index_c = (top_index_n + c) * height; - int data_index_c = (data_index_n + c) * input_height + h_off; - for (int h = 0; h < height; ++h) { - int top_index_h = (top_index_c + h) * width; - int data_index_h = (data_index_c + h) * input_width + w_off; - if (do_mirror) { - int top_index_w = top_index_h + width - 1; - for (int w = 0; w < width; ++w) { - transformed_data[top_index_w-w] = input_data[data_index_h + w]; - } - } else { - for (int w = 0; w < width; ++w) { - transformed_data[top_index_h + w] = input_data[data_index_h + w]; - } - } - } - } - } - if (scale != Dtype(1)) { - DLOG(INFO) << "Scale: " << scale; - caffe_scal(size, scale, transformed_data); - } -} - -template -vector DataTransformer::InferBlobShape(const Datum& datum) { - if (datum.encoded()) { -#ifdef USE_OPENCV - CHECK(!(param_.force_color() && param_.force_gray())) - << "cannot set both force_color and force_gray"; - cv::Mat cv_img; - if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. 
- cv_img = DecodeDatumToCVMat(datum, param_.force_color()); - } else { - cv_img = DecodeDatumToCVMatNative(datum); - } - // InferBlobShape using the cv::image. - return InferBlobShape(cv_img); -#else - LOG(FATAL) << "Encoded datum requires OpenCV; compile with USE_OPENCV."; -#endif // USE_OPENCV - } - const int crop_size = param_.crop_size(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); - // Check dimensions. - CHECK_GT(datum_channels, 0); - CHECK_GE(datum_height, crop_size); - CHECK_GE(datum_width, crop_size); - // Build BlobShape. - vector shape(4); - shape[0] = 1; - shape[1] = datum_channels; - shape[2] = (crop_size)? crop_size: datum_height; - shape[3] = (crop_size)? crop_size: datum_width; - return shape; -} - -template -vector DataTransformer::InferBlobShape( - const vector & datum_vector) { - const int num = datum_vector.size(); - CHECK_GT(num, 0) << "There is no datum to in the vector"; - // Use first datum in the vector to InferBlobShape. - vector shape = InferBlobShape(datum_vector[0]); - // Adjust num to the size of the vector. - shape[0] = num; - return shape; -} - -#ifdef USE_OPENCV -template -vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { - const int crop_size = param_.crop_size(); - const int img_channels = cv_img.channels(); - const int img_height = cv_img.rows; - const int img_width = cv_img.cols; - // Check dimensions. - CHECK_GT(img_channels, 0); - CHECK_GE(img_height, crop_size); - CHECK_GE(img_width, crop_size); - // Build BlobShape. - vector shape(4); - shape[0] = 1; - shape[1] = img_channels; - shape[2] = (crop_size)? crop_size: img_height; - shape[3] = (crop_size)? crop_size: img_width; - return shape; -} - -template -vector DataTransformer::InferBlobShape( - const vector & mat_vector) { - const int num = mat_vector.size(); - CHECK_GT(num, 0) << "There is no cv_img to in the vector"; - // Use first cv_img in the vector to InferBlobShape. - vector shape = InferBlobShape(mat_vector[0]); - // Adjust num to the size of the vector. 
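// Worked example for these InferBlobShape overloads: a 3-channel image with
// 480 rows and 640 columns gives {1, 3, 227, 227} when crop_size == 227 and
// {1, 3, 480, 640} when crop_size == 0; the vector overloads then just
// overwrite shape[0] with the item count, e.g. {32, 3, 227, 227} for a
// batch of 32.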
- shape[0] = num; - return shape; -} -#endif // USE_OPENCV - -template -void DataTransformer::InitRand() { - const bool needs_rand = param_.mirror() || - (phase_ == TRAIN && param_.crop_size()); - if (needs_rand) { - const unsigned int rng_seed = caffe_rng_rand(); - rng_.reset(new Caffe::RNG(rng_seed)); - } else { - rng_.reset(); - } -} - -template -int DataTransformer::Rand(int n) { - CHECK(rng_); - CHECK_GT(n, 0); - caffe::rng_t* rng = - static_cast(rng_->generator()); - return ((*rng)() % n); -} - -INSTANTIATE_CLASS(DataTransformer); - -} // namespace caffe diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp deleted file mode 100644 index 104884e..0000000 --- a/src/caffe/internal_thread.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include -#include - -#include "caffe/internal_thread.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -InternalThread::~InternalThread() { - StopInternalThread(); -} - -bool InternalThread::is_started() const { - return thread_ && thread_->joinable(); -} - -bool InternalThread::must_stop() { - return thread_ && thread_->interruption_requested(); -} - -void InternalThread::StartInternalThread() { - CHECK(!is_started()) << "Threads should persist and not be restarted."; - - int device = 0; -#ifndef CPU_ONLY - CUDA_CHECK(cudaGetDevice(&device)); -#endif - Caffe::Brew mode = Caffe::mode(); - int rand_seed = caffe_rng_rand(); - int solver_count = Caffe::solver_count(); - bool root_solver = Caffe::root_solver(); - - try { - thread_.reset(new boost::thread(&InternalThread::entry, this, device, mode, - rand_seed, solver_count, root_solver)); - } catch (std::exception& e) { - LOG(FATAL) << "Thread exception: " << e.what(); - } -} - -void InternalThread::entry(int device, Caffe::Brew mode, int rand_seed, - int solver_count, bool root_solver) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaSetDevice(device)); -#endif - Caffe::set_mode(mode); - Caffe::set_random_seed(rand_seed); - Caffe::set_solver_count(solver_count); - Caffe::set_root_solver(root_solver); - - InternalThreadEntry(); -} - -void InternalThread::StopInternalThread() { - if (is_started()) { - thread_->interrupt(); - try { - thread_->join(); - } catch (boost::thread_interrupted&) { - } catch (std::exception& e) { - LOG(FATAL) << "Thread exception: " << e.what(); - } - } -} - -} // namespace caffe diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp index 3b91289..684ae88 100644 --- a/src/caffe/layer.cpp +++ b/src/caffe/layer.cpp @@ -1,27 +1,7 @@ -#include #include "caffe/layer.hpp" namespace caffe { -template -void Layer::InitMutex() { - forward_mutex_.reset(new boost::mutex()); -} - -template -void Layer::Lock() { - if (IsShared()) { - forward_mutex_->lock(); - } -} - -template -void Layer::Unlock() { - if (IsShared()) { - forward_mutex_->unlock(); - } -} - INSTANTIATE_CLASS(Layer); } // namespace caffe diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index e967bd6..0c79b07 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -1,8 +1,3 @@ -// Make sure we include Python.h before any system header -// to avoid _POSIX_C_SOURCE redefinition -#ifdef WITH_PYTHON_LAYER -#include -#endif #include #include "caffe/layer.hpp" @@ -16,21 +11,6 @@ #include "caffe/layers/tanh_layer.hpp" #include "caffe/proto/caffe.pb.h" -#ifdef USE_CUDNN -#include "caffe/layers/cudnn_conv_layer.hpp" -#include "caffe/layers/cudnn_lcn_layer.hpp" -#include "caffe/layers/cudnn_lrn_layer.hpp" -#include "caffe/layers/cudnn_pooling_layer.hpp" -#include 
"caffe/layers/cudnn_relu_layer.hpp" -#include "caffe/layers/cudnn_sigmoid_layer.hpp" -#include "caffe/layers/cudnn_softmax_layer.hpp" -#include "caffe/layers/cudnn_tanh_layer.hpp" -#endif - -#ifdef WITH_PYTHON_LAYER -#include "caffe/layers/python_layer.hpp" -#endif - namespace caffe { // Get convolution layer according to engine. @@ -39,32 +19,11 @@ shared_ptr > GetConvolutionLayer( const LayerParameter& param) { ConvolutionParameter conv_param = param.convolution_param(); ConvolutionParameter_Engine engine = conv_param.engine(); -#ifdef USE_CUDNN - bool use_dilation = false; - for (int i = 0; i < conv_param.dilation_size(); ++i) { - if (conv_param.dilation(i) > 1) { - use_dilation = true; - } - } -#endif if (engine == ConvolutionParameter_Engine_DEFAULT) { engine = ConvolutionParameter_Engine_CAFFE; -#ifdef USE_CUDNN - if (!use_dilation) { - engine = ConvolutionParameter_Engine_CUDNN; - } -#endif } if (engine == ConvolutionParameter_Engine_CAFFE) { return shared_ptr >(new ConvolutionLayer(param)); -#ifdef USE_CUDNN - } else if (engine == ConvolutionParameter_Engine_CUDNN) { - if (use_dilation) { - LOG(FATAL) << "CuDNN doesn't support the dilated convolution at Layer " - << param.name(); - } - return shared_ptr >(new CuDNNConvolutionLayer(param)); -#endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; } @@ -78,30 +37,9 @@ shared_ptr > GetPoolingLayer(const LayerParameter& param) { PoolingParameter_Engine engine = param.pooling_param().engine(); if (engine == PoolingParameter_Engine_DEFAULT) { engine = PoolingParameter_Engine_CAFFE; -#ifdef USE_CUDNN - engine = PoolingParameter_Engine_CUDNN; -#endif } if (engine == PoolingParameter_Engine_CAFFE) { return shared_ptr >(new PoolingLayer(param)); -#ifdef USE_CUDNN - } else if (engine == PoolingParameter_Engine_CUDNN) { - if (param.top_size() > 1) { - LOG(INFO) << "cuDNN does not support multiple tops. " - << "Using Caffe's own pooling layer."; - return shared_ptr >(new PoolingLayer(param)); - } - // CuDNN assumes layers are not being modified in place, thus - // breaking our index tracking for updates in some cases in Caffe. 
- // Until there is a workaround in Caffe (index management) or - // cuDNN, use Caffe layer to max pooling, or don't use in place - // layers after max pooling layers - if (param.pooling_param().pool() == PoolingParameter_PoolMethod_MAX) { - return shared_ptr >(new PoolingLayer(param)); - } else { - return shared_ptr >(new CuDNNPoolingLayer(param)); - } -#endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; } @@ -115,30 +53,11 @@ shared_ptr > GetLRNLayer(const LayerParameter& param) { LRNParameter_Engine engine = param.lrn_param().engine(); if (engine == LRNParameter_Engine_DEFAULT) { -#ifdef USE_CUDNN - engine = LRNParameter_Engine_CUDNN; -#else engine = LRNParameter_Engine_CAFFE; -#endif } if (engine == LRNParameter_Engine_CAFFE) { return shared_ptr >(new LRNLayer(param)); -#ifdef USE_CUDNN - } else if (engine == LRNParameter_Engine_CUDNN) { - LRNParameter lrn_param = param.lrn_param(); - - if (lrn_param.norm_region() ==LRNParameter_NormRegion_WITHIN_CHANNEL) { - return shared_ptr >(new CuDNNLCNLayer(param)); - } else { - // local size is too big to be handled through cuDNN - if (param.lrn_param().local_size() > CUDNN_LRN_MAX_N) { - return shared_ptr >(new LRNLayer(param)); - } else { - return shared_ptr >(new CuDNNLRNLayer(param)); - } - } -#endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; } @@ -152,16 +71,9 @@ shared_ptr > GetReLULayer(const LayerParameter& param) { ReLUParameter_Engine engine = param.relu_param().engine(); if (engine == ReLUParameter_Engine_DEFAULT) { engine = ReLUParameter_Engine_CAFFE; -#ifdef USE_CUDNN - engine = ReLUParameter_Engine_CUDNN; -#endif } if (engine == ReLUParameter_Engine_CAFFE) { return shared_ptr >(new ReLULayer(param)); -#ifdef USE_CUDNN - } else if (engine == ReLUParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNReLULayer(param)); -#endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; } @@ -175,16 +87,9 @@ shared_ptr > GetSigmoidLayer(const LayerParameter& param) { SigmoidParameter_Engine engine = param.sigmoid_param().engine(); if (engine == SigmoidParameter_Engine_DEFAULT) { engine = SigmoidParameter_Engine_CAFFE; -#ifdef USE_CUDNN - engine = SigmoidParameter_Engine_CUDNN; -#endif } if (engine == SigmoidParameter_Engine_CAFFE) { return shared_ptr >(new SigmoidLayer(param)); -#ifdef USE_CUDNN - } else if (engine == SigmoidParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNSigmoidLayer(param)); -#endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; } @@ -198,16 +103,9 @@ shared_ptr > GetSoftmaxLayer(const LayerParameter& param) { SoftmaxParameter_Engine engine = param.softmax_param().engine(); if (engine == SoftmaxParameter_Engine_DEFAULT) { engine = SoftmaxParameter_Engine_CAFFE; -#ifdef USE_CUDNN - engine = SoftmaxParameter_Engine_CUDNN; -#endif } if (engine == SoftmaxParameter_Engine_CAFFE) { return shared_ptr >(new SoftmaxLayer(param)); -#ifdef USE_CUDNN - } else if (engine == SoftmaxParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNSoftmaxLayer(param)); -#endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; } @@ -221,16 +119,9 @@ shared_ptr > GetTanHLayer(const LayerParameter& param) { TanHParameter_Engine engine = param.tanh_param().engine(); if (engine == TanHParameter_Engine_DEFAULT) { engine = TanHParameter_Engine_CAFFE; -#ifdef USE_CUDNN - engine = TanHParameter_Engine_CUDNN; -#endif } if (engine == TanHParameter_Engine_CAFFE) { return shared_ptr >(new TanHLayer(param)); 
-#ifdef USE_CUDNN - } else if (engine == TanHParameter_Engine_CUDNN) { - return shared_ptr >(new CuDNNTanHLayer(param)); -#endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; } @@ -238,23 +129,6 @@ shared_ptr > GetTanHLayer(const LayerParameter& param) { REGISTER_LAYER_CREATOR(TanH, GetTanHLayer); -#ifdef WITH_PYTHON_LAYER -template -shared_ptr > GetPythonLayer(const LayerParameter& param) { - Py_Initialize(); - try { - bp::object module = bp::import(param.python_param().module().c_str()); - bp::object layer = module.attr(param.python_param().layer().c_str())(param); - return bp::extract > >(layer)(); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; - } -} - -REGISTER_LAYER_CREATOR(Python, GetPythonLayer); -#endif - // Layers that use their constructor as their default creator should be // registered in their corresponding cpp files. Do not register them here. } // namespace caffe diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 855bf0b..02ee959 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -34,10 +34,6 @@ void AbsValLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(AbsValLayer); -#endif - INSTANTIATE_CLASS(AbsValLayer); REGISTER_LAYER_CLASS(AbsVal); diff --git a/src/caffe/layers/absval_layer.cu b/src/caffe/layers/absval_layer.cu deleted file mode 100644 index 6c927e6..0000000 --- a/src/caffe/layers/absval_layer.cu +++ /dev/null @@ -1,32 +0,0 @@ -#include - -#include "caffe/layers/absval_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void AbsValLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); -} - -template -void AbsValLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_sign(count, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(AbsValLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 4a4c68e..af174be 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -320,77 +320,6 @@ void BaseConvolutionLayer::backward_cpu_bias(Dtype* bias, input, bias_multiplier_.cpu_data(), 1., bias); } -#ifndef CPU_ONLY - -template -void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, - const Dtype* weights, Dtype* output, bool skip_im2col) { - const Dtype* col_buff = input; - if (!is_1x1_) { - if (!skip_im2col) { - conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); - } - col_buff = col_buffer_.gpu_data(); - } - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_, - (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, - (Dtype)0., output + output_offset_ * g); - } -} - -template -void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, - const Dtype* bias) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(), 
- (Dtype)1., output); -} - -template -void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, - const Dtype* weights, Dtype* input) { - Dtype* col_buff = col_buffer_.mutable_gpu_data(); - if (is_1x1_) { - col_buff = input; - } - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_, - conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, - (Dtype)0., col_buff + col_offset_ * g); - } - if (!is_1x1_) { - conv_col2im_gpu(col_buff, input); - } -} - -template -void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, - const Dtype* output, Dtype* weights) { - const Dtype* col_buff = input; - if (!is_1x1_) { - conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); - col_buff = col_buffer_.gpu_data(); - } - for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, - kernel_dim_, conv_out_spatial_dim_, - (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype)1., weights + weight_offset_ * g); - } -} - -template -void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, - const Dtype* input) { - caffe_gpu_gemv(CblasNoTrans, num_output_, out_spatial_dim_, 1., - input, bias_multiplier_.gpu_data(), 1., bias); -} - -#endif // !CPU_ONLY - INSTANTIATE_CLASS(BaseConvolutionLayer); } // namespace caffe diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp deleted file mode 100644 index 989319f..0000000 --- a/src/caffe/layers/base_data_layer.cpp +++ /dev/null @@ -1,134 +0,0 @@ -#include -#include - -#include "caffe/blob.hpp" -#include "caffe/data_transformer.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/layer.hpp" -#include "caffe/layers/base_data_layer.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/blocking_queue.hpp" - -namespace caffe { - -template -BaseDataLayer::BaseDataLayer(const LayerParameter& param) - : Layer(param), - transform_param_(param.transform_param()) { -} - -template -void BaseDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - if (top.size() == 1) { - output_labels_ = false; - } else { - output_labels_ = true; - } - data_transformer_.reset( - new DataTransformer(transform_param_, this->phase_)); - data_transformer_->InitRand(); - // The subclasses should setup the size of bottom and top - DataLayerSetUp(bottom, top); -} - -template -BasePrefetchingDataLayer::BasePrefetchingDataLayer( - const LayerParameter& param) - : BaseDataLayer(param), - prefetch_free_(), prefetch_full_() { - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_free_.push(&prefetch_[i]); - } -} - -template -void BasePrefetchingDataLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - BaseDataLayer::LayerSetUp(bottom, top); - // Before starting the prefetch thread, we make cpu_data and gpu_data - // calls so that the prefetch thread does not accidentally make simultaneous - // cudaMalloc calls when the main thread is running. In some GPUs this - // seems to cause failures if we do not so. 
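// A minimal, self-contained sketch of the double buffering that
// prefetch_free_ / prefetch_full_ provide here, with std::queue plus a mutex
// standing in for Caffe's BlockingQueue and a plain struct standing in for
// Batch; the names below are illustrative and not part of Caffe's API.
#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

template <typename T>
class SimpleBlockingQueue {
 public:
  void push(T item) {
    { std::lock_guard<std::mutex> lock(mutex_); queue_.push(item); }
    cond_.notify_one();
  }
  T pop() {  // blocks until an item is available
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return !queue_.empty(); });
    T item = queue_.front();
    queue_.pop();
    return item;
  }
 private:
  std::queue<T> queue_;
  std::mutex mutex_;
  std::condition_variable cond_;
};

struct FakeBatch { std::vector<float> data; };

int main() {
  const int kPrefetchCount = 3;  // stands in for PREFETCH_COUNT
  std::vector<FakeBatch> batches(kPrefetchCount);
  SimpleBlockingQueue<FakeBatch*> free_q, full_q;
  for (FakeBatch& b : batches) free_q.push(&b);  // every buffer starts free

  // Producer: the role InternalThreadEntry() plays with load_batch().
  std::thread producer([&] {
    for (int iter = 0; iter < 10; ++iter) {
      FakeBatch* b = free_q.pop();                   // reuse an empty buffer
      b->data.assign(4, static_cast<float>(iter));   // "load" the batch
      full_q.push(b);
    }
  });

  // Consumer: the role Forward_cpu() plays each iteration.
  for (int iter = 0; iter < 10; ++iter) {
    FakeBatch* b = full_q.pop();   // wait for a ready batch
    // ... copy b->data into the top blobs here ...
    free_q.push(b);                // recycle the buffer
  }
  producer.join();
  return 0;
}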
- for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_[i].data_.mutable_cpu_data(); - if (this->output_labels_) { - prefetch_[i].label_.mutable_cpu_data(); - } - } -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_[i].data_.mutable_gpu_data(); - if (this->output_labels_) { - prefetch_[i].label_.mutable_gpu_data(); - } - } - } -#endif - DLOG(INFO) << "Initializing prefetch"; - this->data_transformer_->InitRand(); - StartInternalThread(); - DLOG(INFO) << "Prefetch initialized."; -} - -template -void BasePrefetchingDataLayer::InternalThreadEntry() { -#ifndef CPU_ONLY - cudaStream_t stream; - if (Caffe::mode() == Caffe::GPU) { - CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - } -#endif - - try { - while (!must_stop()) { - Batch* batch = prefetch_free_.pop(); - load_batch(batch); -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - batch->data_.data().get()->async_gpu_push(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - } -#endif - prefetch_full_.push(batch); - } - } catch (boost::thread_interrupted&) { - // Interrupted exception is expected on shutdown - } -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - CUDA_CHECK(cudaStreamDestroy(stream)); - } -#endif -} - -template -void BasePrefetchingDataLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { - Batch* batch = prefetch_full_.pop("Data layer prefetch queue empty"); - // Reshape to loaded data. - top[0]->ReshapeLike(batch->data_); - // Copy the data - caffe_copy(batch->data_.count(), batch->data_.cpu_data(), - top[0]->mutable_cpu_data()); - DLOG(INFO) << "Prefetch copied"; - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(batch->label_); - // Copy the labels. - caffe_copy(batch->label_.count(), batch->label_.cpu_data(), - top[1]->mutable_cpu_data()); - } - - prefetch_free_.push(batch); -} - -#ifdef CPU_ONLY -STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward); -#endif - -INSTANTIATE_CLASS(BaseDataLayer); -INSTANTIATE_CLASS(BasePrefetchingDataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu deleted file mode 100644 index 4056d36..0000000 --- a/src/caffe/layers/base_data_layer.cu +++ /dev/null @@ -1,31 +0,0 @@ -#include - -#include "caffe/layers/base_data_layer.hpp" - -namespace caffe { - -template -void BasePrefetchingDataLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - Batch* batch = prefetch_full_.pop("Data layer prefetch queue empty"); - // Reshape to loaded data. - top[0]->ReshapeLike(batch->data_); - // Copy the data - caffe_copy(batch->data_.count(), batch->data_.gpu_data(), - top[0]->mutable_gpu_data()); - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(batch->label_); - // Copy the labels. - caffe_copy(batch->label_.count(), batch->label_.gpu_data(), - top[1]->mutable_gpu_data()); - } - // Ensure the copy is synchronous wrt the host, so that the next batch isn't - // copied in meanwhile. 
- CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - prefetch_free_.push(batch); -} - -INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index a69d8f9..65b52aa 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -229,11 +229,6 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); } - -#ifdef CPU_ONLY -STUB_GPU(BatchNormLayer); -#endif - INSTANTIATE_CLASS(BatchNormLayer); REGISTER_LAYER_CLASS(BatchNorm); } // namespace caffe diff --git a/src/caffe/layers/batch_norm_layer.cu b/src/caffe/layers/batch_norm_layer.cu deleted file mode 100644 index c21713c..0000000 --- a/src/caffe/layers/batch_norm_layer.cu +++ /dev/null @@ -1,171 +0,0 @@ -#include -#include - -#include "caffe/layers/batch_norm_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void BatchNormLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int num = bottom[0]->shape(0); - int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0)); - - if (bottom[0] != top[0]) { - caffe_copy(bottom[0]->count(), bottom_data, top_data); - } - - - if (use_global_stats_) { - // use the stored mean/variance estimates. - const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? - 0 : 1 / this->blobs_[2]->cpu_data()[0]; - caffe_gpu_scale(variance_.count(), scale_factor, - this->blobs_[0]->gpu_data(), mean_.mutable_gpu_data()); - caffe_gpu_scale(variance_.count(), scale_factor, - this->blobs_[1]->gpu_data(), variance_.mutable_gpu_data()); - } else { - // compute mean - caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, - 1. / (num * spatial_dim), bottom_data, - spatial_sum_multiplier_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemv(CblasTrans, num, channels_, 1., - num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0., - mean_.mutable_gpu_data()); - } - - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, - batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, channels_ * num, - spatial_dim, 1, -1, num_by_chans_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 1., top_data); - - if (!use_global_stats_) { - // compute variance using var(X) = E((X-EX)^2) - caffe_gpu_powx(top[0]->count(), top_data, Dtype(2), - temp_.mutable_gpu_data()); // (X-EX)^2 - caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, - 1. / (num * spatial_dim), temp_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemv(CblasTrans, num, channels_, 1., - num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E((X_EX)^2) - - // compute and save moving average - this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; - this->blobs_[2]->mutable_cpu_data()[0] += 1; - caffe_gpu_axpby(mean_.count(), Dtype(1), mean_.gpu_data(), - moving_average_fraction_, this->blobs_[0]->mutable_gpu_data()); - int m = bottom[0]->count()/channels_; - Dtype bias_correction_factor = m > 1 ? 
Dtype(m)/(m-1) : 1; - caffe_gpu_axpby(variance_.count(), bias_correction_factor, - variance_.gpu_data(), moving_average_fraction_, - this->blobs_[1]->mutable_gpu_data()); - } - - // normalize variance - caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - // replicate variance to input size - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, - batch_sum_multiplier_.gpu_data(), variance_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, channels_ * num, - spatial_dim, 1, 1., num_by_chans_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - // TODO(cdoersch): The caching is only needed because later in-place layers - // might clobber the data. Can we skip this if they won't? - caffe_copy(x_norm_.count(), top_data, - x_norm_.mutable_gpu_data()); -} - -template -void BatchNormLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff; - if (bottom[0] != top[0]) { - top_diff = top[0]->gpu_diff(); - } else { - caffe_copy(x_norm_.count(), top[0]->gpu_diff(), x_norm_.mutable_gpu_diff()); - top_diff = x_norm_.gpu_diff(); - } - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (use_global_stats_) { - caffe_gpu_div(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); - return; - } - const Dtype* top_data = x_norm_.gpu_data(); - int num = bottom[0]->shape()[0]; - int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0)); - // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then - // - // dE(Y)/dX = - // (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y) - // ./ sqrt(var(X) + eps) - // - // where \cdot and ./ are hadamard product and elementwise division, - // respectively, dE/dY is the top diff, and mean/var/sum are all computed - // along all dimensions except the channels dimension. In the above - // equation, the operations allow for expansion (i.e. broadcast) along all - // dimensions except the channels dimension where required. 
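// Mapping that formula onto the calls below, roughly: the gemv pair after
// "sum(dE/dY \cdot Y)" reduces top_diff * Y over the spatial and batch
// dimensions to a per-channel total, the gemm pair broadcasts it back to
// full size and multiplies by Y, and the same reduce-then-broadcast is
// repeated for plain top_diff; caffe_gpu_axpby then forms dE/dY minus
// 1/(num * spatial_dim) times those accumulated sums, and the final
// division by temp_ (sqrt(var(X) + eps), cached during the forward pass)
// yields dE/dX.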
- - // sum(dE/dY \cdot Y) - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., - bottom_diff, spatial_sum_multiplier_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemv(CblasTrans, num, channels_, 1., - num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0., - mean_.mutable_gpu_data()); - - // reshape (broadcast) the above - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, - batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, channels_ * num, - spatial_dim, 1, 1., num_by_chans_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 0., bottom_diff); - - // sum(dE/dY \cdot Y) \cdot Y - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y - caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., - top_diff, spatial_sum_multiplier_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemv(CblasTrans, num, channels_, 1., - num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0., - mean_.mutable_gpu_data()); - // reshape (broadcast) the above to make - // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, - batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num * channels_, - spatial_dim, 1, 1., num_by_chans_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 1., bottom_diff); - - // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, - Dtype(-1. / (num * spatial_dim)), bottom_diff); - - // note: temp_ still contains sqrt(var(X)+eps), computed during the forward - // pass. 
- caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(BatchNormLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/batch_reindex_layer.cpp b/src/caffe/layers/batch_reindex_layer.cpp index b14e56f..d9a80b7 100644 --- a/src/caffe/layers/batch_reindex_layer.cpp +++ b/src/caffe/layers/batch_reindex_layer.cpp @@ -68,10 +68,6 @@ void BatchReindexLayer::Backward_cpu( } } -#ifdef CPU_ONLY -STUB_GPU(BatchReindexLayer); -#endif - INSTANTIATE_CLASS(BatchReindexLayer); REGISTER_LAYER_CLASS(BatchReindex); diff --git a/src/caffe/layers/batch_reindex_layer.cu b/src/caffe/layers/batch_reindex_layer.cu deleted file mode 100644 index 83054d3..0000000 --- a/src/caffe/layers/batch_reindex_layer.cu +++ /dev/null @@ -1,106 +0,0 @@ -#include -#include -#include - -#include "caffe/layers/batch_reindex_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void BRForward(const int count, const int inner_dim, const Dtype* in, - const Dtype* permut, Dtype* out) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / (inner_dim); - int in_n = static_cast(permut[n]); - out[index] = in[in_n * (inner_dim) + index % (inner_dim)]; - } -} - -template -void BatchReindexLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - check_batch_reindex(bottom[0]->shape(0), bottom[1]->count(), - bottom[1]->cpu_data()); - if (top[0]->count() == 0) { - return; - } - int threads = top[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BRForward <<>>( - top[0]->count(), bottom[0]->count() / bottom[0]->shape(0), - bottom[0]->gpu_data(), bottom[1]->gpu_data(), top[0]->mutable_gpu_data()); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void BRBackward(const int count, const int inner_dim, - const Dtype* in, const Dtype* top_indexes, - const Dtype* begins, const Dtype* counts, - Dtype* out) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / (inner_dim); - out[index] = 0; - int lower = static_cast(begins[n]); - int upper = lower + static_cast(counts[n]); - for (int i = lower; i < upper; ++i) { - int in_n = static_cast(top_indexes[i]); - out[index] += in[in_n * (inner_dim) + index % (inner_dim)]; - } - } -} - -template -void BatchReindexLayer::Backward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - CHECK(!propagate_down[1]) << "Cannot backprop to index."; - if (!propagate_down[0]) { - return; - } - - vector > mapping; - const Dtype* perm = bottom[1]->cpu_data(); - for (int i = 0; i < bottom[1]->count(); ++i) { - mapping.push_back(pair(static_cast(perm[i]), i)); - } - std::sort(mapping.begin(), mapping.end(), pair_sort_first()); - - // Each element of the bottom diff is potentially the sum of many top diffs. - // However, we'd like each CUDA thread to handle exactly one output. Hence, - // we first pre-compute a list of lists of indices that need to be summed for - // each output. `top_indexes` holds the data of this list of lists. The - // k'th element of `begins` points to the location in `top_indexes` where the - // list for the k'th example begin, and the k'th element of `counts` is the - // length of that list. 
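// Small worked example of that precomputation: with permut = {1, 0, 1}
// (three top rows drawn from two bottom rows), the sorted mapping is
// {(0,1), (1,0), (1,2)}, so top_indexes = {1, 0, 2}, begins = {0, 1} and
// counts = {1, 2}: bottom row 0 accumulates top diff row 1, and bottom
// row 1 accumulates top diff rows 0 and 2.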
- vector shape; - shape.push_back(bottom[1]->count()); - Blob top_indexes(shape); - shape[0] = bottom[0]->shape(0); - Blob counts(shape); - Blob begins(shape); - Dtype* t_i_data = top_indexes.mutable_cpu_data(); - Dtype* c_data = counts.mutable_cpu_data(); - Dtype* b_data = begins.mutable_cpu_data(); - caffe_set(begins.count(), Dtype(-1), b_data); - caffe_set(counts.count(), Dtype(0), c_data); - for (int i = 0; i < mapping.size(); ++i) { - t_i_data[i] = mapping[i].second; - if (b_data[mapping[i].first] == -1) { - b_data[mapping[i].first] = i; - } - c_data[mapping[i].first] += 1; - } - - int threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BRBackward <<>>( - bottom[0]->count(), bottom[0]->count() / bottom[0]->shape(0), - top[0]->gpu_diff(), top_indexes.gpu_data(), begins.gpu_data(), - counts.gpu_data(), bottom[0]->mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; -} - -INSTANTIATE_LAYER_GPU_FUNCS(BatchReindexLayer); - -} // namespace caffe diff --git a/src/caffe/layers/bias_layer.cpp b/src/caffe/layers/bias_layer.cpp index 4726a72..33b998b 100644 --- a/src/caffe/layers/bias_layer.cpp +++ b/src/caffe/layers/bias_layer.cpp @@ -111,10 +111,6 @@ void BiasLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(BiasLayer); -#endif - INSTANTIATE_CLASS(BiasLayer); REGISTER_LAYER_CLASS(Bias); diff --git a/src/caffe/layers/bias_layer.cu b/src/caffe/layers/bias_layer.cu deleted file mode 100644 index 8ac913a..0000000 --- a/src/caffe/layers/bias_layer.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layers/bias_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void BiasForward(const int n, const Dtype* in, - const Dtype* bias, const int bias_dim, const int inner_dim, - Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - const int bias_index = (index / inner_dim) % bias_dim; - out[index] = in[index] + bias[bias_index]; - } -} - -template -void BiasLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = top[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* bias_data = - ((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - BiasForward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, bottom_data, bias_data, bias_dim_, inner_dim_, top_data); -} - -template -void BiasLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[0] && bottom[0] != top[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(bottom[0]->count(), top_diff, bottom_diff); - } - // in-place, we don't need to do anything with the data diff - const bool bias_param = (bottom.size() == 1); - if ((!bias_param && propagate_down[1]) || - (bias_param && this->param_propagate_down_[0])) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bias_diff = (bias_param ? 
this->blobs_[0].get() : bottom[1]) - ->mutable_gpu_diff(); - bool accum = bias_param; - for (int n = 0; n < outer_dim_; ++n) { - caffe_gpu_gemv(CblasNoTrans, bias_dim_, inner_dim_, Dtype(1), - top_diff, bias_multiplier_.gpu_data(), Dtype(accum), bias_diff); - top_diff += dim_; - accum = true; - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(BiasLayer); - -} // namespace caffe diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 448d86d..a6d345c 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -37,10 +37,6 @@ void BNLLLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(BNLLLayer); -#endif - INSTANTIATE_CLASS(BNLLLayer); REGISTER_LAYER_CLASS(BNLL); diff --git a/src/caffe/layers/bnll_layer.cu b/src/caffe/layers/bnll_layer.cu deleted file mode 100644 index 8df8ef0..0000000 --- a/src/caffe/layers/bnll_layer.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include - -#include "caffe/layers/bnll_layer.hpp" - -namespace caffe { - -const float kBNLL_THRESHOLD = 50.; - -template -__global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? - in[index] + log(1. + exp(-in[index])) : - log(1. + exp(in[index])); - } -} - -template -void BNLLLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void BNLLBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - Dtype expval = exp(min(in_data[index], Dtype(kBNLL_THRESHOLD))); - out_diff[index] = in_diff[index] * expval / (expval + 1.); - } -} - -template -void BNLLLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLBackward<<>>( - count, top_diff, bottom_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(BNLLLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 580bd47..22b33b5 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -94,10 +94,6 @@ void ConcatLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(ConcatLayer); -#endif - INSTANTIATE_CLASS(ConcatLayer); REGISTER_LAYER_CLASS(Concat); diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu deleted file mode 100644 index a3a0bf6..0000000 --- a/src/caffe/layers/concat_layer.cu +++ /dev/null @@ -1,73 +0,0 @@ -#include - -#include "caffe/layers/concat_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void Concat(const int nthreads, const Dtype* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = 
index / total_concat_size; - const int concat_index = index % total_concat_size; - const int top_index = concat_index + - (concat_num * top_concat_axis + offset_concat_axis) * concat_size; - if (forward) { - out_data[top_index] = in_data[index]; - } else { - out_data[index] = in_data[top_index]; - } - } -} - -template -void ConcatLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - if (bottom.size() == 1) { return; } - Dtype* top_data = top[0]->mutable_gpu_data(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = true; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, bottom_data, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); - offset_concat_axis += bottom_concat_axis; - } -} - -template -void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (bottom.size() == 1) { return; } - const Dtype* top_diff = top[0]->gpu_diff(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = false; - for (int i = 0; i < bottom.size(); ++i) { - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - if (propagate_down[i]) { - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); - } - offset_concat_axis += bottom_concat_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConcatLayer); - -} // namespace caffe diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index 599e178..fd4589f 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -110,10 +110,6 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(ContrastiveLossLayer); -#endif - INSTANTIATE_CLASS(ContrastiveLossLayer); REGISTER_LAYER_CLASS(ContrastiveLoss); diff --git a/src/caffe/layers/contrastive_loss_layer.cu b/src/caffe/layers/contrastive_loss_layer.cu deleted file mode 100644 index fd7d67c..0000000 --- a/src/caffe/layers/contrastive_loss_layer.cu +++ /dev/null @@ -1,109 +0,0 @@ -#include -#include - -#include "caffe/layers/contrastive_loss_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void ContrastiveLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), // a - bottom[1]->gpu_data(), // b - diff_.mutable_gpu_data()); // a_i-b_i - caffe_gpu_powx( - count, - diff_.mutable_gpu_data(), // a_i-b_i - Dtype(2), - diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 - caffe_gpu_gemv( - CblasNoTrans, - bottom[0]->num(), - bottom[0]->channels(), - Dtype(1.0), - diff_sq_.gpu_data(), // (a_i-b_i)^2 - summer_vec_.gpu_data(), - Dtype(0.0), - dist_sq_.mutable_gpu_data()); // \Sum 
(a_i-b_i)^2 - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - Dtype loss(0.0); - for (int i = 0; i < bottom[0]->num(); ++i) { - if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs - loss += dist_sq_.cpu_data()[i]; - } else { // dissimilar pairs - if (legacy_version) { - loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); - } else { - Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), - Dtype(0.0)); - loss += dist*dist; - } - } - } - loss = loss / static_cast(bottom[0]->num()) / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -__global__ void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff) { - CUDA_KERNEL_LOOP(i, count) { - int n = i / channels; // the num index, to access y and dist_sq - if (static_cast(y[n])) { // similar pairs - bottom_diff[i] = alpha * diff[i]; - } else { // dissimilar pairs - Dtype mdist(0.0); - Dtype beta(0.0); - if (legacy_version) { - mdist = (margin - dist_sq[n]); - beta = -alpha; - } else { - Dtype dist = sqrt(dist_sq[n]); - mdist = (margin - dist); - beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; - } - if (mdist > 0.0) { - bottom_diff[i] = beta; - } else { - bottom_diff[i] = 0; - } - } - } -} - -template -void ContrastiveLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const int count = bottom[0]->count(); - const int channels = bottom[0]->channels(); - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - const bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - const Dtype sign = (i == 0) ? 
1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[0]->num()); - // NOLINT_NEXT_LINE(whitespace/operators) - CLLBackward<<>>( - count, channels, margin, legacy_version, alpha, - bottom[2]->gpu_data(), // pair similarity 0 or 1 - diff_.gpu_data(), // the cached eltwise difference between a and b - dist_sq_.gpu_data(), // the cached square distance between a and b - bottom[i]->mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ContrastiveLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 5d522ab..3b75df4 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -72,10 +72,6 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(ConvolutionLayer); -#endif - INSTANTIATE_CLASS(ConvolutionLayer); } // namespace caffe diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu deleted file mode 100644 index d06e4b6..0000000 --- a/src/caffe/layers/conv_layer.cu +++ /dev/null @@ -1,60 +0,0 @@ -#include - -#include "caffe/layers/conv_layer.hpp" - -namespace caffe { - -template -void ConvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->forward_gpu_gemm(bottom_data + n * this->bottom_dim_, weight, - top_data + n * this->top_dim_); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + n * this->top_dim_, bias); - } - } - } -} - -template -void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + n * this->top_dim_); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(bottom_data + n * this->bottom_dim_, - top_diff + n * this->top_dim_, weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
- if (propagate_down[i]) { - this->backward_gpu_gemm(top_diff + n * this->top_dim_, weight, - bottom_diff + n * this->bottom_dim_); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp index aecdcd6..b5a3312 100644 --- a/src/caffe/layers/crop_layer.cpp +++ b/src/caffe/layers/crop_layer.cpp @@ -133,10 +133,6 @@ void CropLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(CropLayer); -#endif - INSTANTIATE_CLASS(CropLayer); REGISTER_LAYER_CLASS(Crop); diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu deleted file mode 100644 index f78cecb..0000000 --- a/src/caffe/layers/crop_layer.cu +++ /dev/null @@ -1,115 +0,0 @@ -#include - -#include "caffe/layers/crop_layer.hpp" - -namespace caffe { - -// Copy (one line per thread) from one array to another, with arbitrary -// strides in the last two dimensions. -template -__global__ void copy_kernel(const int n, const int height, const int width, - const int src_outer_stride, const int src_inner_stride, - const int dest_outer_stride, const int dest_inner_stride, - const Dtype* src, Dtype* dest) { - CUDA_KERNEL_LOOP(index, n) { - int src_start = index / height * src_outer_stride - + index % height * src_inner_stride; - int dest_start = index / height * dest_outer_stride - + index % height * dest_inner_stride; - for (int i = 0; i < width; ++i) { - dest[dest_start + i] = src[src_start + i]; - } - } -} - -template -void CropLayer::crop_copy_gpu(const vector*>& bottom, - const vector*>& top, - const vector& offsets, - vector indices, - int cur_dim, - const Dtype* src_data, - Dtype* dest_data, - bool is_forward) { - if (cur_dim + 2 < top[0]->num_axes()) { - // We are not yet at the final dimension, call copy recursivley - for (int i = 0; i < top[0]->shape(cur_dim); ++i) { - indices[cur_dim] = i; - crop_copy_gpu(bottom, top, offsets, indices, cur_dim+1, - src_data, dest_data, is_forward); - } - } else { - // We are at the last two dimensions, which are stored continously in memory - // With (N,C,H,W) - // (0,1,2,3) cur_dim -> H - // cur_dim+1 -> W - const int lines = top[0]->shape(cur_dim); - const int height = top[0]->shape(cur_dim); - const int width = top[0]->shape(cur_dim+1); - std::vector ind_off(cur_dim+2, 0); - for (int j = 0; j < cur_dim; ++j) { - ind_off[j] = indices[j] + offsets[j]; - } - ind_off[cur_dim] = offsets[cur_dim]; - ind_off[cur_dim+1] = offsets[cur_dim+1]; - // Compute copy strides - const int src_outer_stride = - bottom[0]->shape(cur_dim)*bottom[0]->shape(cur_dim+1); - const int src_inner_stride = bottom[0]->shape(cur_dim+1); - const int dest_outer_stride = - top[0]->shape(cur_dim)*top[0]->shape(cur_dim+1); - const int dest_inner_stride = top[0]->shape(cur_dim+1); - - if (is_forward) { - const Dtype* bottom_data = bottom[0]->gpu_data() + - bottom[0]->offset(ind_off); - Dtype* top_data = top[0]->mutable_gpu_data() + - top[0]->offset(indices); - // NOLINT_NEXT_LINE(whitespace/operators) - copy_kernel<<>>( - lines, height, width, - src_outer_stride, src_inner_stride, - dest_outer_stride, dest_inner_stride, - bottom_data, top_data); - - } else { - const Dtype* top_diff = top[0]->gpu_diff() + - top[0]->offset(indices); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff() + - bottom[0]->offset(ind_off); - // NOLINT_NEXT_LINE(whitespace/operators) - copy_kernel<<>>( - lines, height, width, - dest_outer_stride, dest_inner_stride, - src_outer_stride, src_inner_stride, - top_diff, 
bottom_diff); - } - } -} - -template -void CropLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - std::vector indices(top[0]->num_axes(), 0); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - crop_copy_gpu(bottom, top, offsets, indices, 0, bottom_data, top_data, true); -} - -template -void CropLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - if (propagate_down[0]) { - caffe_gpu_set(bottom[0]->count(), static_cast(0), bottom_diff); - std::vector indices(top[0]->num_axes(), 0); - crop_copy_gpu(bottom, top, offsets, indices, 0, top_diff, bottom_diff, - false); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CropLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp deleted file mode 100644 index 1987fb0..0000000 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ /dev/null @@ -1,268 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layers/cudnn_conv_layer.hpp" - -namespace caffe { - -// Set to three for the benefit of the backward pass, which -// can use separate streams for calculating the gradient w.r.t. -// bias, filter weights, and bottom data for each group independently -#define CUDNN_STREAMS_PER_GROUP 3 - -/** - * TODO(dox) explain cuDNN interface - */ -template -void CuDNNConvolutionLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::LayerSetUp(bottom, top); - // Initialize CUDA streams and cuDNN. - stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - - // Initialize algorithm arrays - fwd_algo_ = new cudnnConvolutionFwdAlgo_t[bottom.size()]; - bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()]; - bwd_data_algo_ = new cudnnConvolutionBwdDataAlgo_t[bottom.size()]; - - // initialize size arrays - workspace_fwd_sizes_ = new size_t[bottom.size()]; - workspace_bwd_filter_sizes_ = new size_t[bottom.size()]; - workspace_bwd_data_sizes_ = new size_t[bottom.size()]; - - // workspace data - workspaceSizeInBytes = 0; - workspaceData = NULL; - workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP]; - - for (size_t i = 0; i < bottom.size(); ++i) { - // initialize all to default algorithms - fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0; - bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0; - bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0; - // default algorithms don't require workspace - workspace_fwd_sizes_[i] = 0; - workspace_bwd_data_sizes_[i] = 0; - workspace_bwd_filter_sizes_[i] = 0; - } - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - CUDA_CHECK(cudaStreamCreate(&stream_[g])); - CUDNN_CHECK(cudnnCreate(&handle_[g])); - CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); - workspace[g] = NULL; - } - - // Set the indexing parameters. - bias_offset_ = (this->num_output_ / this->group_); - - // Create filter descriptor. - const int* kernel_shape_data = this->kernel_shape_.cpu_data(); - const int kernel_h = kernel_shape_data[0]; - const int kernel_w = kernel_shape_data[1]; - cudnn::createFilterDesc(&filter_desc_, - this->num_output_ / this->group_, this->channels_ / this->group_, - kernel_h, kernel_w); - - // Create tensor descriptor(s) for data and corresponding convolution(s). 
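[Editor's note] The LayerSetUp above allocates group_ * CUDNN_STREAMS_PER_GROUP CUDA streams and binds one cuDNN handle to each, so the bias, filter, and data gradients of every group can later be issued concurrently. A minimal sketch of that stream/handle pairing outside of Caffe (makeHandles/destroyHandles are illustrative names; real code should wrap every call in CUDA_CHECK/CUDNN_CHECK as the original does):

#include <cuda_runtime.h>
#include <cudnn.h>
#include <vector>

struct StreamHandlePair {
  cudaStream_t stream;
  cudnnHandle_t handle;
};

// One cuDNN handle per stream: work issued through a handle is ordered on
// its bound stream, so independent handles can overlap on the GPU.
std::vector<StreamHandlePair> makeHandles(int n) {
  std::vector<StreamHandlePair> pairs(n);
  for (int i = 0; i < n; ++i) {
    cudaStreamCreate(&pairs[i].stream);
    cudnnCreate(&pairs[i].handle);
    cudnnSetStream(pairs[i].handle, pairs[i].stream);
  }
  return pairs;
}

void destroyHandles(std::vector<StreamHandlePair>* pairs) {
  for (size_t i = 0; i < pairs->size(); ++i) {
    cudnnDestroy((*pairs)[i].handle);
    cudaStreamDestroy((*pairs)[i].stream);
  }
}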
- for (int i = 0; i < bottom.size(); i++) { - cudnnTensorDescriptor_t bottom_desc; - cudnn::createTensor4dDesc(&bottom_desc); - bottom_descs_.push_back(bottom_desc); - cudnnTensorDescriptor_t top_desc; - cudnn::createTensor4dDesc(&top_desc); - top_descs_.push_back(top_desc); - cudnnConvolutionDescriptor_t conv_desc; - cudnn::createConvolutionDesc(&conv_desc); - conv_descs_.push_back(conv_desc); - } - - // Tensor descriptor for bias. - if (this->bias_term_) { - cudnn::createTensor4dDesc(&bias_desc_); - } - - handles_setup_ = true; -} - -template -void CuDNNConvolutionLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::Reshape(bottom, top); - CHECK_EQ(2, this->num_spatial_axes_) - << "CuDNNConvolution input must have 2 spatial axes " - << "(e.g., height and width). " - << "Use 'engine: CAFFE' for general ND convolution."; - bottom_offset_ = this->bottom_dim_ / this->group_; - top_offset_ = this->top_dim_ / this->group_; - const int height = bottom[0]->shape(this->channel_axis_ + 1); - const int width = bottom[0]->shape(this->channel_axis_ + 2); - const int height_out = top[0]->shape(this->channel_axis_ + 1); - const int width_out = top[0]->shape(this->channel_axis_ + 2); - const int* pad_data = this->pad_.cpu_data(); - const int pad_h = pad_data[0]; - const int pad_w = pad_data[1]; - const int* stride_data = this->stride_.cpu_data(); - const int stride_h = stride_data[0]; - const int stride_w = stride_data[1]; - - // Specify workspace limit for kernels directly until we have a - // planning strategy and a rewrite of Caffe's GPU memory mangagement - size_t workspace_limit_bytes = 8*1024*1024; - - for (int i = 0; i < bottom.size(); i++) { - cudnn::setTensor4dDesc(&bottom_descs_[i], - this->num_, - this->channels_ / this->group_, height, width, - this->channels_ * height * width, - height * width, width, 1); - cudnn::setTensor4dDesc(&top_descs_[i], - this->num_, - this->num_output_ / this->group_, height_out, width_out, - this->num_output_ * this->out_spatial_dim_, - this->out_spatial_dim_, width_out, 1); - cudnn::setConvolutionDesc(&conv_descs_[i], bottom_descs_[i], - filter_desc_, pad_h, pad_w, - stride_h, stride_w); - - // choose forward and backward algorithms + workspace(s) - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &fwd_algo_[i])); - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - fwd_algo_[i], - &(workspace_fwd_sizes_[i]))); - - // choose backward algorithm for filter - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0], - bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, &bwd_filter_algo_[i]) ); - - // get workspace for backwards filter algorithm - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0], - bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, - bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i])); - - // choose backward algo for data - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0], - filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, &bwd_data_algo_[i])); - - // get workspace size - 
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0], - filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], - bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) ); - } - - // reduce over all workspace sizes to get a maximum to allocate / reallocate - size_t total_workspace_fwd = 0; - size_t total_workspace_bwd_data = 0; - size_t total_workspace_bwd_filter = 0; - - for (size_t i = 0; i < bottom.size(); i++) { - total_workspace_fwd = std::max(total_workspace_fwd, - workspace_fwd_sizes_[i]); - total_workspace_bwd_data = std::max(total_workspace_bwd_data, - workspace_bwd_data_sizes_[i]); - total_workspace_bwd_filter = std::max(total_workspace_bwd_filter, - workspace_bwd_filter_sizes_[i]); - } - // get max over all operations - size_t max_workspace = std::max(total_workspace_fwd, - total_workspace_bwd_data); - max_workspace = std::max(max_workspace, total_workspace_bwd_filter); - // ensure all groups have enough workspace - size_t total_max_workspace = max_workspace * - (this->group_ * CUDNN_STREAMS_PER_GROUP); - - // this is the total amount of storage needed over all groups + streams - if (total_max_workspace > workspaceSizeInBytes) { - DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace; - workspaceSizeInBytes = total_max_workspace; - - // free the existing workspace and allocate a new (larger) one - cudaFree(this->workspaceData); - - cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes); - if (err != cudaSuccess) { - // force zero memory path - for (int i = 0; i < bottom.size(); i++) { - workspace_fwd_sizes_[i] = 0; - workspace_bwd_filter_sizes_[i] = 0; - workspace_bwd_data_sizes_[i] = 0; - fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; - bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; - } - - // NULL out all workspace pointers - for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { - workspace[g] = NULL; - } - // NULL out underlying data - workspaceData = NULL; - workspaceSizeInBytes = 0; - } - - // if we succeed in the allocation, set pointer aliases for workspaces - for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { - workspace[g] = reinterpret_cast(workspaceData) + g*max_workspace; - } - } - - // Tensor descriptor for bias. - if (this->bias_term_) { - cudnn::setTensor4dDesc(&bias_desc_, - 1, this->num_output_ / this->group_, 1, 1); - } -} - -template -CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { - // Check that handles have been setup before destroying. 
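[Editor's note] The Reshape logic above sizes a single workspace for the most demanding algorithm, scales it by groups times streams, and hands each group an alias into that one allocation; on allocation failure it falls back to the zero-workspace cuDNN algorithms. A compact sketch of the "allocate once, alias per group" pattern under those assumptions (allocWorkspace is an illustrative name, error handling beyond the fallback signal is omitted):

#include <cuda_runtime.h>
#include <algorithm>
#include <vector>

// Allocate one block sized for the worst-case algorithm and carve out
// per-group aliases, instead of issuing one cudaMalloc per group.
cudaError_t allocWorkspace(const std::vector<size_t>& per_op_sizes,
                           int num_groups, void** data,
                           std::vector<void*>* aliases) {
  size_t max_ws = 0;
  for (size_t i = 0; i < per_op_sizes.size(); ++i) {
    max_ws = std::max(max_ws, per_op_sizes[i]);
  }
  cudaError_t err = cudaMalloc(data, max_ws * num_groups);
  if (err != cudaSuccess) { return err; }  // caller switches to no-workspace algorithms
  aliases->resize(num_groups);
  for (int g = 0; g < num_groups; ++g) {
    (*aliases)[g] = reinterpret_cast<char*>(*data) + g * max_ws;
  }
  return cudaSuccess;
}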
- if (!handles_setup_) { return; } - - for (int i = 0; i < bottom_descs_.size(); i++) { - cudnnDestroyTensorDescriptor(bottom_descs_[i]); - cudnnDestroyTensorDescriptor(top_descs_[i]); - cudnnDestroyConvolutionDescriptor(conv_descs_[i]); - } - if (this->bias_term_) { - cudnnDestroyTensorDescriptor(bias_desc_); - } - cudnnDestroyFilterDescriptor(filter_desc_); - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - cudaStreamDestroy(stream_[g]); - cudnnDestroy(handle_[g]); - } - - cudaFree(workspaceData); - delete [] stream_; - delete [] handle_; - delete [] fwd_algo_; - delete [] bwd_filter_algo_; - delete [] bwd_data_algo_; - delete [] workspace_fwd_sizes_; - delete [] workspace_bwd_data_sizes_; - delete [] workspace_bwd_filter_sizes_; -} - -INSTANTIATE_CLASS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu deleted file mode 100644 index 8bc5346..0000000 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ /dev/null @@ -1,118 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_conv_layer.hpp" - -namespace caffe { - -__global__ void sync_conv_groups() { } - -template -void CuDNNConvolutionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - - // Forward through cuDNN in parallel over groups. - for (int g = 0; g < this->group_; g++) { - // Filters. - CUDNN_CHECK(cudnnConvolutionForward(handle_[g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - filter_desc_, weight + this->weight_offset_ * g, - conv_descs_[i], - fwd_algo_[i], workspace[g], workspace_fwd_sizes_[i], - cudnn::dataType::zero, - top_descs_[i], top_data + top_offset_ * g)); - - // Bias. - if (this->bias_term_) { - const Dtype* bias_data = this->blobs_[1]->gpu_data(); - CUDNN_CHECK(cudnnAddTensor(handle_[g], - cudnn::dataType::one, - bias_desc_, bias_data + bias_offset_ * g, - cudnn::dataType::one, - top_descs_[i], top_data + top_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -template -void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = NULL; - Dtype* weight_diff = NULL; - if (this->param_propagate_down_[0]) { - weight = this->blobs_[0]->gpu_data(); - weight_diff = this->blobs_[0]->mutable_gpu_diff(); - } - Dtype* bias_diff = NULL; - if (this->bias_term_ && this->param_propagate_down_[1]) { - bias_diff = this->blobs_[1]->mutable_gpu_diff(); - } - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Backward through cuDNN in parallel over groups and gradients. - for (int g = 0; g < this->group_; g++) { - // Gradient w.r.t. bias. - if (this->bias_term_ && this->param_propagate_down_[1]) { - CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g], - cudnn::dataType::one, - top_descs_[i], top_diff + top_offset_ * g, - cudnn::dataType::one, - bias_desc_, bias_diff + bias_offset_ * g)); - } - - // Gradient w.r.t. weights. 
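[Editor's note] sync_conv_groups above relies on the legacy default stream: a kernel launched into stream 0 does not begin until previously issued work in other blocking streams has finished, so an empty <<<1, 1>>> launch acts as a join point for the per-group streams. A standalone sketch of the idea (assumes the other streams were created with cudaStreamCreate, i.e. as blocking streams, and that per-thread default-stream compilation is not enabled):

#include <cuda_runtime.h>

__global__ void emptyKernel() { }

// After issuing kernels on several blocking streams, launching an empty
// kernel on the legacy default stream serializes against all of them: it
// will not start until the earlier work on those streams has completed.
void joinStreams() {
  emptyKernel<<<1, 1>>>();   // implicit barrier w.r.t. blocking streams
  cudaDeviceSynchronize();   // optional: also make the host wait
}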
- if (this->param_propagate_down_[0]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - CUDNN_CHECK(cudnnConvolutionBackwardFilter( - handle_[1*this->group_ + g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - bwd_filter_algo_[i], workspace[1*this->group_ + g], - workspace_bwd_filter_sizes_[i], - cudnn::dataType::one, - filter_desc_, weight_diff + this->weight_offset_ * g)); - } - - // Gradient w.r.t. bottom data. - if (propagate_down[i]) { - if (weight == NULL) { - weight = this->blobs_[0]->gpu_data(); - } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnConvolutionBackwardData( - handle_[2*this->group_ + g], - cudnn::dataType::one, - filter_desc_, weight + this->weight_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - bwd_data_algo_[i], workspace[2*this->group_ + g], - workspace_bwd_data_sizes_[i], - cudnn::dataType::zero, - bottom_descs_[i], bottom_diff + bottom_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_lcn_layer.cpp b/src/caffe/layers/cudnn_lcn_layer.cpp deleted file mode 100644 index 9c09bf2..0000000 --- a/src/caffe/layers/cudnn_lcn_layer.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_lcn_layer.hpp" - -namespace caffe { - -template -void CuDNNLCNLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - LRNLayer::LayerSetUp(bottom, top); - - CUDNN_CHECK(cudnnCreate(&handle_)); - CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - - // create a LRN handle - handles_setup_ = true; - - size_ = this->layer_param().lrn_param().local_size(); - pre_pad_ = (size_ - 1) / 2; - alpha_ = this->layer_param().lrn_param().alpha(); - beta_ = this->layer_param().lrn_param().beta(); - k_ = this->layer_param().lrn_param().k(); -} - -template -void CuDNNLCNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - LRNLayer::Reshape(bottom, top); - cudnn::setTensor4dDesc(&bottom_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - cudnn::setTensor4dDesc(&top_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_)); - - // allocate / reallocate tempData buffers - size_t totalSizeInBytes = sizeof(Dtype)*bottom[0]->num()* \ - this->channels_*this->height_*this->width_; - - if (totalSizeInBytes > tempDataSize) { - tempDataSize = totalSizeInBytes; - - cudaFree(tempData1); - cudaFree(tempData2); - - // allocate new buffers - CUDA_CHECK(cudaMalloc(&tempData1, totalSizeInBytes)); - CUDA_CHECK(cudaMalloc(&tempData2, totalSizeInBytes)); - } -} - -template -CuDNNLCNLayer::~CuDNNLCNLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - - // destroy LRN handle - cudnnDestroy(handle_); - - // free temp buffers - cudaFree(tempData1); - cudaFree(tempData2); -} - -INSTANTIATE_CLASS(CuDNNLCNLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_lcn_layer.cu b/src/caffe/layers/cudnn_lcn_layer.cu deleted file mode 100644 index b44ef47..0000000 --- a/src/caffe/layers/cudnn_lcn_layer.cu +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_lcn_layer.hpp" - -namespace caffe { - -template -void CuDNNLCNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - - CUDNN_CHECK(cudnnDivisiveNormalizationForward( - handle_, norm_desc_, CUDNN_DIVNORM_PRECOMPUTED_MEANS, - cudnn::dataType::one, - bottom_desc_, bottom_data, - NULL, // srcMeansData - this->tempData1, this->tempData2, - cudnn::dataType::zero, - top_desc_, top_data) ); -} - -template -void CuDNNLCNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnDivisiveNormalizationBackward( - handle_, norm_desc_, CUDNN_DIVNORM_PRECOMPUTED_MEANS, - cudnn::dataType::one, - bottom_desc_, bottom_data, - NULL, top_diff, // NULL - srcMeansData - this->tempData1, this->tempData2, - cudnn::dataType::zero, - bottom_desc_, bottom_diff, - NULL) ); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNLCNLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_lrn_layer.cpp b/src/caffe/layers/cudnn_lrn_layer.cpp deleted file mode 100644 index 0495b80..0000000 --- a/src/caffe/layers/cudnn_lrn_layer.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_lrn_layer.hpp" - -namespace caffe { - -template -void CuDNNLRNLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - LRNLayer::LayerSetUp(bottom, top); - - CUDNN_CHECK(cudnnCreate(&handle_)); - CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - - // create a LRN handle - handles_setup_ = true; - - size_ = this->layer_param().lrn_param().local_size(); - alpha_ = this->layer_param().lrn_param().alpha(); - beta_ = this->layer_param().lrn_param().beta(); - k_ = this->layer_param().lrn_param().k(); -} - -template -void CuDNNLRNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - LRNLayer::Reshape(bottom, top); - cudnn::setTensor4dDesc(&bottom_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - cudnn::setTensor4dDesc(&top_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_)); -} - -template -CuDNNLRNLayer::~CuDNNLRNLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - - // destroy LRN handle - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNLRNLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_lrn_layer.cu b/src/caffe/layers/cudnn_lrn_layer.cu deleted file mode 100644 index ca647f3..0000000 --- a/src/caffe/layers/cudnn_lrn_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_lrn_layer.hpp" - -namespace caffe { - -template -void CuDNNLRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - - CUDNN_CHECK(cudnnLRNCrossChannelForward( - handle_, norm_desc_, CUDNN_LRN_CROSS_CHANNEL_DIM1, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data) ); -} - -template -void CuDNNLRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnLRNCrossChannelBackward( - handle_, norm_desc_, CUDNN_LRN_CROSS_CHANNEL_DIM1, - cudnn::dataType::one, - top_desc_, top_data, - top_desc_, top_diff, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - bottom_desc_, bottom_diff) ); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNLRNLayer); - -}; // namespace caffe - -#endif diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp deleted file mode 100644 index 24f1478..0000000 --- a/src/caffe/layers/cudnn_pooling_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_pooling_layer.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::LayerSetUp(bottom, top); - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - cudnn::createPoolingDesc(&pooling_desc_, - this->layer_param_.pooling_param().pool(), &mode_, - this->kernel_h_, this->kernel_w_, this->pad_h_, this->pad_w_, - this->stride_h_, this->stride_w_); - handles_setup_ = true; -} - -template -void CuDNNPoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::Reshape(bottom, top); - cudnn::setTensor4dDesc(&bottom_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - cudnn::setTensor4dDesc(&top_desc_, bottom[0]->num(), - this->channels_, this->pooled_height_, this->pooled_width_); -} - -template -CuDNNPoolingLayer::~CuDNNPoolingLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroyPoolingDescriptor(pooling_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_pooling_layer.cu b/src/caffe/layers/cudnn_pooling_layer.cu deleted file mode 100644 index 6f00195..0000000 --- a/src/caffe/layers/cudnn_pooling_layer.cu +++ /dev/null @@ -1,41 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_pooling_layer.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnPoolingForward(handle_, pooling_desc_, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNPoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnPoolingBackward(handle_, pooling_desc_, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp deleted file mode 100644 index 795e0a9..0000000 --- a/src/caffe/layers/cudnn_relu_layer.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_relu_layer.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - ReLULayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - cudnn::createActivationDescriptor(&activ_desc_, CUDNN_ACTIVATION_RELU); - handles_setup_ = true; -} - -template -void CuDNNReLULayer::Reshape(const vector*>& bottom, - const vector*>& top) { - ReLULayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNReLULayer::~CuDNNReLULayer() { - // Check that handles have been setup before destroying. - if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_relu_layer.cu b/src/caffe/layers/cudnn_relu_layer.cu deleted file mode 100644 index e7928bb..0000000 --- a/src/caffe/layers/cudnn_relu_layer.cu +++ /dev/null @@ -1,74 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_relu_layer.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Fallback to standard Caffe for leaky ReLU. 
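[Editor's note] The check that follows this comment (continued below) exists because CUDNN_ACTIVATION_RELU only implements max(x, 0); a nonzero negative_slope turns the op into leaky ReLU, which the deleted code therefore routes back to the plain Caffe kernels. For reference, leaky ReLU and its gradient are simply (a minimal sketch, matching the formula Caffe's CPU ReLU uses):

#include <algorithm>

// Leaky ReLU: identity for positive inputs, scaled by `slope` otherwise.
template <typename Dtype>
Dtype leaky_relu(Dtype x, Dtype slope) {
  return std::max(x, Dtype(0)) + slope * std::min(x, Dtype(0));
}

// Derivative w.r.t. x, used when backpropagating through the layer.
template <typename Dtype>
Dtype leaky_relu_grad(Dtype x, Dtype slope) {
  return x > 0 ? Dtype(1) : slope;
}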
- if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Forward_gpu(bottom, top); - } - - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); -#if CUDNN_VERSION_MIN(5, 0, 0) - CUDNN_CHECK(cudnnActivationForward(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -#else - CUDNN_CHECK(cudnnActivationForward_v4(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -#endif -} - -template -void CuDNNReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - // Fallback to standard Caffe for leaky ReLU. - if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Backward_gpu(top, propagate_down, bottom); - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); -#if CUDNN_VERSION_MIN(5, 0, 0) - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -#else - CUDNN_CHECK(cudnnActivationBackward_v4(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -#endif -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp deleted file mode 100644 index 3ce6aef..0000000 --- a/src/caffe/layers/cudnn_sigmoid_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_sigmoid_layer.hpp" - -namespace caffe { - -template -void CuDNNSigmoidLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - cudnn::createActivationDescriptor(&activ_desc_, - CUDNN_ACTIVATION_SIGMOID); - handles_setup_ = true; -} - -template -void CuDNNSigmoidLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNSigmoidLayer::~CuDNNSigmoidLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNSigmoidLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cu b/src/caffe/layers/cudnn_sigmoid_layer.cu deleted file mode 100644 index 48d6cba..0000000 --- a/src/caffe/layers/cudnn_sigmoid_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_sigmoid_layer.hpp" - -namespace caffe { - -template -void CuDNNSigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); -#if CUDNN_VERSION_MIN(5, 0, 0) - CUDNN_CHECK(cudnnActivationForward(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -#else - CUDNN_CHECK(cudnnActivationForward_v4(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -#endif -} - -template -void CuDNNSigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); -#if CUDNN_VERSION_MIN(5, 0, 0) - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -#else - CUDNN_CHECK(cudnnActivationBackward_v4(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -#endif -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSigmoidLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp deleted file mode 100644 index 6440df9..0000000 --- a/src/caffe/layers/cudnn_softmax_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "thrust/device_vector.h" - -#include "caffe/layers/cudnn_softmax_layer.hpp" - -namespace caffe { - -template -void CuDNNSoftmaxLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SoftmaxLayer::LayerSetUp(bottom, top); - // Initialize CUDNN. - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNSoftmaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - SoftmaxLayer::Reshape(bottom, top); - int N = this->outer_num_; - int K = bottom[0]->shape(this->softmax_axis_); - int H = this->inner_num_; - int W = 1; - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNSoftmaxLayer::~CuDNNSoftmaxLayer() { - // Check that handles have been setup before destroying. 
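[Editor's note] The Reshape above maps Caffe's arbitrary-axis softmax onto a 4-D cuDNN tensor by folding everything before the softmax axis into N and everything after it into H (with W fixed at 1), so CUDNN_SOFTMAX_MODE_CHANNEL normalizes exactly over that axis. A sketch of the folding, assuming a shape vector and an axis index (foldAroundAxis is an illustrative name):

#include <vector>

// Fold an N-D shape around `axis`: outer = product of dims before it,
// channels = the dim itself, inner = product of dims after it.
void foldAroundAxis(const std::vector<int>& shape, int axis,
                    int* outer, int* channels, int* inner) {
  *outer = 1;
  for (int i = 0; i < axis; ++i) { *outer *= shape[i]; }
  *channels = shape[axis];
  *inner = 1;
  for (int i = axis + 1; i < static_cast<int>(shape.size()); ++i) {
    *inner *= shape[i];
  }
}
// For a (2, 10, 3, 4) blob with axis 1 this yields outer=2, channels=10,
// inner=12, matching N, K and H in the descriptors above.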
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_softmax_layer.cu b/src/caffe/layers/cudnn_softmax_layer.cu deleted file mode 100644 index 7283eb7..0000000 --- a/src/caffe/layers/cudnn_softmax_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "thrust/device_vector.h" - -#include "caffe/layers/cudnn_softmax_layer.hpp" - -namespace caffe { - -template -void CuDNNSoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNSoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnSoftmaxBackward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp deleted file mode 100644 index e87dd9d..0000000 --- a/src/caffe/layers/cudnn_tanh_layer.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_tanh_layer.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - TanHLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - cudnn::createActivationDescriptor(&activ_desc_, CUDNN_ACTIVATION_TANH); - handles_setup_ = true; -} - -template -void CuDNNTanHLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - TanHLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNTanHLayer::~CuDNNTanHLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_tanh_layer.cu b/src/caffe/layers/cudnn_tanh_layer.cu deleted file mode 100644 index 6b5d7ae..0000000 --- a/src/caffe/layers/cudnn_tanh_layer.cu +++ /dev/null @@ -1,65 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/layers/cudnn_tanh_layer.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); -#if CUDNN_VERSION_MIN(5, 0, 0) - CUDNN_CHECK(cudnnActivationForward(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -#else - CUDNN_CHECK(cudnnActivationForward_v4(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -#endif -} - -template -void CuDNNTanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - -#if CUDNN_VERSION_MIN(5, 0, 0) - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -#else - CUDNN_CHECK(cudnnActivationBackward_v4(this->handle_, - activ_desc_, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -#endif -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp deleted file mode 100644 index 66e6301..0000000 --- a/src/caffe/layers/data_layer.cpp +++ /dev/null @@ -1,110 +0,0 @@ -#ifdef USE_OPENCV -#include -#endif // USE_OPENCV -#include - -#include - -#include "caffe/data_transformer.hpp" -#include "caffe/layers/data_layer.hpp" -#include "caffe/util/benchmark.hpp" - -namespace caffe { - -template -DataLayer::DataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param), - reader_(param) { -} - -template -DataLayer::~DataLayer() { - this->StopInternalThread(); -} - -template -void DataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - const int batch_size = this->layer_param_.data_param().batch_size(); - // Read a data point, and use it to initialize the top blob. - Datum& datum = *(reader_.full().peek()); - - // Use data_transformer to infer the expected blob shape from datum. - vector top_shape = this->data_transformer_->InferBlobShape(datum); - this->transformed_data_.Reshape(top_shape); - // Reshape top[0] and prefetch_data according to the batch_size. 
- top_shape[0] = batch_size; - top[0]->Reshape(top_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].data_.Reshape(top_shape); - } - LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); - // label - if (this->output_labels_) { - vector label_shape(1, batch_size); - top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); - } - } -} - -// This function is called on prefetch thread -template -void DataLayer::load_batch(Batch* batch) { - CPUTimer batch_timer; - batch_timer.Start(); - double read_time = 0; - double trans_time = 0; - CPUTimer timer; - CHECK(batch->data_.count()); - CHECK(this->transformed_data_.count()); - - // Reshape according to the first datum of each batch - // on single input batches allows for inputs of varying dimension. - const int batch_size = this->layer_param_.data_param().batch_size(); - Datum& datum = *(reader_.full().peek()); - // Use data_transformer to infer the expected blob shape from datum. - vector top_shape = this->data_transformer_->InferBlobShape(datum); - this->transformed_data_.Reshape(top_shape); - // Reshape batch according to the batch_size. - top_shape[0] = batch_size; - batch->data_.Reshape(top_shape); - - Dtype* top_data = batch->data_.mutable_cpu_data(); - Dtype* top_label = NULL; // suppress warnings about uninitialized variables - - if (this->output_labels_) { - top_label = batch->label_.mutable_cpu_data(); - } - for (int item_id = 0; item_id < batch_size; ++item_id) { - timer.Start(); - // get a datum - Datum& datum = *(reader_.full().pop("Waiting for data")); - read_time += timer.MicroSeconds(); - timer.Start(); - // Apply data transformations (mirror, scale, crop...) - int offset = batch->data_.offset(item_id); - this->transformed_data_.set_cpu_data(top_data + offset); - this->data_transformer_->Transform(datum, &(this->transformed_data_)); - // Copy label. 
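[Editor's note] The transform step just above writes each datum straight into its slice of the contiguous batch blob via batch->data_.offset(item_id), avoiding a per-item staging copy. A toy sketch of that offset computation, assuming the usual layout where the first dimension is the batch index (itemSlice is an illustrative name; this is effectively what Blob::offset(item_id) yields):

#include <vector>

// Start of item `item_id` inside a contiguous batch buffer: the product of
// all dimensions after the first is the per-item size, so each prefetched
// datum can be transformed directly into its own slice.
template <typename Dtype>
Dtype* itemSlice(Dtype* batch_data, const std::vector<int>& shape,
                 int item_id) {
  int item_size = 1;
  for (size_t i = 1; i < shape.size(); ++i) { item_size *= shape[i]; }
  return batch_data + item_id * item_size;
}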
- if (this->output_labels_) { - top_label[item_id] = datum.label(); - } - trans_time += timer.MicroSeconds(); - - reader_.free().push(const_cast(&datum)); - } - timer.Stop(); - batch_timer.Stop(); - DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; - DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; -} - -INSTANTIATE_CLASS(DataLayer); -REGISTER_LAYER_CLASS(Data); - -} // namespace caffe diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index 20a460f..6b9102b 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -74,10 +74,6 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(DeconvolutionLayer); -#endif - INSTANTIATE_CLASS(DeconvolutionLayer); REGISTER_LAYER_CLASS(Deconvolution); diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu deleted file mode 100644 index 2267632..0000000 --- a/src/caffe/layers/deconv_layer.cu +++ /dev/null @@ -1,61 +0,0 @@ -#include - -#include "caffe/layers/deconv_layer.hpp" - -namespace caffe { - -template -void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_gemm(bottom_data + n * this->bottom_dim_, weight, - top_data + n * this->top_dim_); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + n * this->top_dim_, bias); - } - } - } -} - -template -void DeconvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + n * this->top_dim_); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(top_diff + n * this->top_dim_, - bottom_data + n * this->bottom_dim_, weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
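[Editor's note] The deleted deconvolution kernels are the mirror image of the convolution ones: Forward_gpu calls backward_gpu_gemm, and the gradient w.r.t. the bottom data (continued below) calls forward_gpu_gemm. Deconvolution also inverts convolution's shape relation; a small sketch of the output size along one spatial axis, under the transposed-convolution formula the CPU layer uses in recent Caffe versions (with dilation = 1 it reduces to stride * (in - 1) + kernel - 2 * pad):

// Output extent of a deconvolution ("transposed convolution") along one
// spatial axis:
//   out = stride * (in - 1) + dilation * (kernel - 1) + 1 - 2 * pad
int deconvOutputDim(int in, int kernel, int stride, int pad, int dilation) {
  int kernel_extent = dilation * (kernel - 1) + 1;
  return stride * (in - 1) + kernel_extent - 2 * pad;
}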
- if (propagate_down[i]) { - this->forward_gpu_gemm(top_diff + n * this->top_dim_, weight, - bottom_diff + n * this->bottom_dim_, - this->param_propagate_down_[0]); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DeconvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index 533ab26..9898cec 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -64,11 +64,6 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } - -#ifdef CPU_ONLY -STUB_GPU(DropoutLayer); -#endif - INSTANTIATE_CLASS(DropoutLayer); REGISTER_LAYER_CLASS(Dropout); diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu deleted file mode 100644 index 186c10c..0000000 --- a/src/caffe/layers/dropout_layer.cu +++ /dev/null @@ -1,70 +0,0 @@ -#include - -#include "caffe/layers/dropout_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void DropoutForward(const int n, const Dtype* in, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] * (mask[index] > threshold) * scale; - } -} - -template -void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - unsigned int* mask = - static_cast(rand_vec_.mutable_gpu_data()); - caffe_gpu_rng_uniform(count, mask); - // set thresholds - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutForward<<>>( - count, bottom_data, mask, uint_thres_, scale_, top_data); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(count, bottom_data, top_data); - } -} - -template -__global__ void DropoutBackward(const int n, const Dtype* in_diff, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * scale * (mask[index] > threshold); - } -} - -template -void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->phase_ == TRAIN) { - const unsigned int* mask = - static_cast(rand_vec_.gpu_data()); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutBackward<<>>( - count, top_diff, mask, uint_thres_, scale_, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(top[0]->count(), top_diff, bottom_diff); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DropoutLayer); - -} // namespace caffe diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp deleted file mode 100644 index e382bfe..0000000 --- a/src/caffe/layers/dummy_data_layer.cpp +++ /dev/null @@ -1,114 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layers/dummy_data_layer.hpp" - -namespace caffe { - -template -void DummyDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - const int num_top = top.size(); - const DummyDataParameter& param = this->layer_param_.dummy_data_param(); - const int num_data_filler = param.data_filler_size(); - CHECK(num_data_filler == 0 || num_data_filler == 1 || - num_data_filler == num_top) - << "Number of data fillers must be 0, 1 or equal to the number of 
tops: " - << num_top << "; you specified " << num_data_filler << " data fillers."; - - const bool legacy_dims = param.num_size() || param.channels_size() || - param.height_size() || param.width_size(); - if (legacy_dims) { - CHECK_EQ(0, param.shape_size()) - << "Both shape and legacy fields were specified"; - // Using deprecated 4D output dim specifiers. - CHECK(param.num_size() == 1 || param.num_size() == num_top) - << "Must specify 'num' once, or once per top blob " - << "(" << num_top << "); specified " << param.num_size() << "."; - CHECK(param.channels_size() == 1 || param.channels_size() == num_top) - << "Must specify 'channels' once, or once per top blob " - << "(" << num_top << "); specified " << param.channels_size() << "."; - CHECK(param.height_size() == 1 || param.height_size() == num_top) - << "Must specify 'height' once, or once per top blob " - << "(" << num_top << "); specified " << param.height_size() << "."; - CHECK(param.width_size() == 1 || param.width_size() == num_top) - << "Must specify 'width' once, or once per top blob " - << "(" << num_top << "); specified " << param.width_size() << "."; - } else { - CHECK(param.shape_size() == 1 || param.shape_size() == num_top) - << "Must specify 'shape' once, or once per top blob " - << "(" << num_top << "); specified " << param.shape_size() << "."; - } - // refill_[i] tells Forward i whether or not to actually refill top Blob i. - // If refill_[i] is false, Forward does nothing for Blob i. We use this to - // avoid wastefully refilling "constant" Blobs in every forward pass. - // We first fill refill_ in with the INVERSE of its final values. - // The first time we run Forward from the LayerSetUp method, we'll fill only - // Blobs for which refill_ is normally false. These Blobs will never be - // filled again. - refill_.clear(); - fillers_.clear(); - if (num_data_filler <= 1) { - FillerParameter filler_param; - if (num_data_filler == 0) { - filler_param.set_type("constant"); - filler_param.set_value(0); - } else { - filler_param.CopyFrom(param.data_filler(0)); - } - // Refill on each iteration iff not using a constant filler, - // but use the inverse of this rule for the first run. - refill_.resize(1); - refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0); - fillers_.resize(1); - fillers_[0].reset(GetFiller(filler_param)); - } else { - refill_.resize(num_top); - fillers_.resize(num_top); - for (int i = 0; i < num_top; ++i) { - fillers_[i].reset(GetFiller(param.data_filler(i))); - // Refill on each iteration iff not using a constant filler, - // but use the inverse of this rule for the first run. - refill_[i] = - (strcmp(param.data_filler(i).type().c_str(), "constant") == 0); - } - } - for (int i = 0; i < num_top; ++i) { - if (legacy_dims) { - const int num = (param.num_size() == 1) ? param.num(0) : param.num(i); - const int channels = - (param.channels_size() == 1) ? param.channels(0) : param.channels(i); - const int height = - (param.height_size() == 1) ? param.height(0) : param.height(i); - const int width = - (param.width_size() == 1) ? param.width(0) : param.width(i); - top[i]->Reshape(num, channels, height, width); - } else { - const int shape_index = (param.shape_size() == 1) ? 0 : i; - top[i]->Reshape(param.shape(shape_index)); - } - } - // Run Forward once, with refill_ inverted, to fill the constant Blobs. - this->Forward(bottom, top); - // Invert the inverted refill_ values to refill the desired (non-constant) - // Blobs in every usual forward pass. 
- for (int i = 0; i < refill_.size(); ++i) { - refill_[i] = !refill_[i]; - } -} - -template -void DummyDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - const int filler_id = (fillers_.size() > 1) ? i : 0; - if (refill_[filler_id]) { - fillers_[filler_id]->Fill(top[i]); - } - } -} - -INSTANTIATE_CLASS(DummyDataLayer); -REGISTER_LAYER_CLASS(DummyData); - -} // namespace caffe diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 2125616..0124ecc 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -150,10 +150,6 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(EltwiseLayer); -#endif - INSTANTIATE_CLASS(EltwiseLayer); REGISTER_LAYER_CLASS(Eltwise); diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu deleted file mode 100644 index c142852..0000000 --- a/src/caffe/layers/eltwise_layer.cu +++ /dev/null @@ -1,134 +0,0 @@ -#include -#include - -#include "caffe/layers/eltwise_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype maxval = -FLT_MAX; - int maxidx = -1; - if (bottom_data_a[index] > bottom_data_b[index]) { - // only update for very first bottom_data blob (blob_idx == 0) - if (blob_idx == 0) { - maxval = bottom_data_a[index]; - top_data[index] = maxval; - maxidx = blob_idx; - mask[index] = maxidx; - } - } else { - maxval = bottom_data_b[index]; - top_data[index] = maxval; - maxidx = blob_idx + 1; - mask[index] = maxidx; - } - } -} - -template -void EltwiseLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int* mask = NULL; - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), - top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
- for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.mutable_gpu_data(); - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward <<>>( - count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask); - for (int i = 2; i < bottom.size(); ++i) { - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward<<>>( - count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } -} - -template -__global__ void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype gradient = 0; - if (mask[index] == blob_idx) { - gradient += top_diff[index]; - } - bottom_diff[index] = gradient; - } -} - -template -void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int* mask = NULL; - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - if (stable_prod_grad_) { - bool initialized = false; - for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { continue; } - if (!initialized) { - caffe_copy(count, bottom[j]->gpu_data(), bottom_diff); - initialized = true; - } else { - caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, - bottom_diff); - } - } - } else { - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - } - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.gpu_data(); - MaxBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, top_diff, i, mask, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EltwiseLayer); - -} // namespace caffe diff --git a/src/caffe/layers/elu_layer.cpp b/src/caffe/layers/elu_layer.cpp index a0f8763..5fdc0de 100644 --- a/src/caffe/layers/elu_layer.cpp +++ b/src/caffe/layers/elu_layer.cpp @@ -36,11 +36,6 @@ void ELULayer::Backward_cpu(const vector*>& top, } } - -#ifdef CPU_ONLY -STUB_GPU(ELULayer); -#endif - INSTANTIATE_CLASS(ELULayer); REGISTER_LAYER_CLASS(ELU); diff --git a/src/caffe/layers/elu_layer.cu b/src/caffe/layers/elu_layer.cu deleted file mode 100644 index 12545aa..0000000 --- a/src/caffe/layers/elu_layer.cu +++ /dev/null @@ -1,62 +0,0 @@ -#include -#include - -#include "caffe/layers/elu_layer.hpp" - -namespace caffe { - -template -__global__ void ELUForward(const int n, const Dtype* in, Dtype* out, - Dtype alpha) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? 
in[index] : - alpha * (exp(in[index]) - 1); - } -} - -template -void ELULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - Dtype alpha = this->layer_param_.elu_param().alpha(); - // NOLINT_NEXT_LINE(whitespace/operators) - ELUForward<<>>( - count, bottom_data, top_data, alpha); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void ELUBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, const Dtype* in_data, - Dtype* out_diff, Dtype alpha) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_data[index] > 0 ? in_diff[index] : - in_diff[index] * (out_data[index] + alpha); - } -} - -template -void ELULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - Dtype alpha = this->layer_param_.elu_param().alpha(); - // NOLINT_NEXT_LINE(whitespace/operators) - ELUBackward<<>>( - count, top_diff, top_data, bottom_data, bottom_diff, alpha); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(ELULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/embed_layer.cpp b/src/caffe/layers/embed_layer.cpp index 36b40d7..1665b0d 100644 --- a/src/caffe/layers/embed_layer.cpp +++ b/src/caffe/layers/embed_layer.cpp @@ -109,10 +109,6 @@ void EmbedLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(EmbedLayer); -#endif - INSTANTIATE_CLASS(EmbedLayer); REGISTER_LAYER_CLASS(Embed); diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu deleted file mode 100644 index 6324a3a..0000000 --- a/src/caffe/layers/embed_layer.cu +++ /dev/null @@ -1,81 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layers/embed_layer.hpp" -#include "caffe/util/gpu_util.cuh" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void EmbedForward(const int nthreads, const Dtype* bottom_data, - const Dtype* weight, const int M, const int N, const int K, - Dtype* top_data) { - CUDA_KERNEL_LOOP(top_index, nthreads) { - const int n = top_index / N; - const int d = top_index % N; - const int index = static_cast(bottom_data[n]); - const int weight_index = index * N + d; - top_data[top_index] = weight[weight_index]; - } -} - -template -__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, - const Dtype* top_diff, const int M, const int N, const int K, - Dtype* weight_diff); - -template -__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, - const Dtype* top_diff, const int M, const int N, const int K, - Dtype* weight_diff) { - CUDA_KERNEL_LOOP(top_index, nthreads) { - const int n = top_index / N; - const int d = top_index % N; - const int index = static_cast(bottom_data[n]); - const int weight_index = index * N + d; - caffe_gpu_atomic_add(top_diff[top_index], weight_diff + weight_index); - } -} - -template -void EmbedLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - const int count = top[0]->count(); - EmbedForward // 
NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, bottom_data, weight, M_, N_, K_, top_data); - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1), - bias_multiplier_.gpu_data(), - this->blobs_[1]->gpu_data(), Dtype(1), top_data); - } -} - -template -void EmbedLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input."; - if (this->param_propagate_down_[0]) { - const int top_count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - EmbedBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - top_count, bottom_data, top_diff, M_, N_, K_, weight_diff); - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - caffe_gpu_gemv(CblasTrans, M_, N_, Dtype(1), top_diff, - bias_multiplier_.gpu_data(), Dtype(1), bias_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EmbedLayer); - -} // namespace caffe diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 300d991..38de456 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -45,10 +45,6 @@ void EuclideanLossLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(EuclideanLossLayer); -#endif - INSTANTIATE_CLASS(EuclideanLossLayer); REGISTER_LAYER_CLASS(EuclideanLoss); diff --git a/src/caffe/layers/euclidean_loss_layer.cu b/src/caffe/layers/euclidean_loss_layer.cu deleted file mode 100644 index 4c221b6..0000000 --- a/src/caffe/layers/euclidean_loss_layer.cu +++ /dev/null @@ -1,42 +0,0 @@ -#include - -#include "caffe/layers/euclidean_loss_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 
1 : -1;
-      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
-      caffe_gpu_axpby(
-          bottom[i]->count(),              // count
-          alpha,                           // alpha
-          diff_.gpu_data(),                // a
-          Dtype(0),                        // beta
-          bottom[i]->mutable_gpu_diff());  // b
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(EuclideanLossLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp
index 0c1b463..cb64e92 100644
--- a/src/caffe/layers/exp_layer.cpp
+++ b/src/caffe/layers/exp_layer.cpp
@@ -58,10 +58,6 @@ void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
-STUB_GPU(ExpLayer);
-#endif
-
 INSTANTIATE_CLASS(ExpLayer);
 REGISTER_LAYER_CLASS(Exp);
 
diff --git a/src/caffe/layers/exp_layer.cu b/src/caffe/layers/exp_layer.cu
deleted file mode 100644
index 61f7f11..0000000
--- a/src/caffe/layers/exp_layer.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <vector>
-
-#include "caffe/layers/exp_layer.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  if (inner_scale_ == Dtype(1)) {
-    caffe_gpu_exp(count, bottom_data, top_data);
-  } else {
-    caffe_gpu_scale(count, inner_scale_, bottom_data, top_data);
-    caffe_gpu_exp(count, top_data, top_data);
-  }
-  if (outer_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, outer_scale_, top_data);
-  }
-}
-
-template <typename Dtype>
-void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  const int count = bottom[0]->count();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  caffe_gpu_mul(count, top_data, top_diff, bottom_diff);
-  if (inner_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, inner_scale_, bottom_diff);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ExpLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp
index e226c0b..e023ad0 100644
--- a/src/caffe/layers/filter_layer.cpp
+++ b/src/caffe/layers/filter_layer.cpp
@@ -115,10 +115,6 @@ void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
-STUB_GPU(FilterLayer);
-#endif
-
 INSTANTIATE_CLASS(FilterLayer);
 REGISTER_LAYER_CLASS(Filter);
 
diff --git a/src/caffe/layers/filter_layer.cu b/src/caffe/layers/filter_layer.cu
deleted file mode 100644
index b01b16f..0000000
--- a/src/caffe/layers/filter_layer.cu
+++ /dev/null
@@ -1,69 +0,0 @@
-#include <vector>
-
-#include "caffe/layers/filter_layer.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int new_tops_num = indices_to_forward_.size();
-  // forward all filtered items for all bottoms but the Selector (bottom[last])
-  for (int t = 0; t < top.size(); ++t) {
-    const Dtype* bottom_data = bottom[t]->gpu_data();
-    Dtype* top_data = top[t]->mutable_gpu_data();
-    int dim = bottom[t]->count() / bottom[t]->shape(0);
-    for (int n = 0; n < new_tops_num; ++n) {
-      int data_offset_top = n * dim;
-      int data_offset_bottom = indices_to_forward_[n] * dim;
-      caffe_copy(dim, bottom_data + data_offset_bottom,
-          top_data + data_offset_top);
-    }
-  }
-}
-
-template <typename Dtype>
-void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if
(propagate_down[bottom.size() - 1]) { - LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; - } - for (int i = 0; i < top.size(); ++i) { - // bottom[last] is the selector and never needs backpropagation - // so we can iterate over top vector because top.size() == bottom.size() -1 - if (propagate_down[i]) { - const int dim = top[i]->count() / top[i]->shape(0); - int next_to_backward_offset = 0; - int batch_offset = 0; - int data_offset_bottom = 0; - int data_offset_top = 0; - for (int n = 0; n < bottom[i]->shape(0); ++n) { - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - data_offset_bottom = n * dim; - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - data_offset_bottom = n * dim; - if (n != batch_offset) { // this data was not been forwarded - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - ++next_to_backward_offset; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(FilterLayer); - -} // namespace caffe diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 2fb9b3c..5c90811 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -183,10 +183,6 @@ void Im2colLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(Im2colLayer); -#endif - INSTANTIATE_CLASS(Im2colLayer); REGISTER_LAYER_CLASS(Im2col); diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu deleted file mode 100644 index 792c97f..0000000 --- a/src/caffe/layers/im2col_layer.cu +++ /dev/null @@ -1,62 +0,0 @@ -#include - -#include "caffe/layers/im2col_layer.hpp" -#include "caffe/util/im2col.hpp" - -namespace caffe { - -template -void Im2colLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int num_kernels = channels_ * top[0]->count(channel_axis_ + 1); - for (int n = 0; n < num_; ++n) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - im2col_gpu(bottom_data + n * bottom_dim_, channels_, - bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - dilation_.cpu_data()[0], dilation_.cpu_data()[1], - top_data + n * top_dim_); - } else { - im2col_nd_gpu(bottom_data + n * bottom_dim_, num_spatial_axes_, - num_kernels, bottom[0]->gpu_shape() + channel_axis_, - top[0]->gpu_shape() + channel_axis_, - kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(), - dilation_.gpu_data(), top_data + n * top_dim_); - } - } -} - -template -void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int n = 0; n < num_; ++n) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - col2im_gpu(top_diff + n * top_dim_, channels_, - 
bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - dilation_.cpu_data()[0], dilation_.cpu_data()[1], - bottom_diff + n * bottom_dim_); - } else { - col2im_nd_gpu(top_diff + n * top_dim_, num_spatial_axes_, bottom_dim_, - bottom[0]->gpu_shape() + channel_axis_, - top[0]->gpu_shape() + channel_axis_, - kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(), - dilation_.gpu_data(), bottom_diff + n * bottom_dim_); - } - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(Im2colLayer); - -} // namespace caffe diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp deleted file mode 100644 index 7ee7dc4..0000000 --- a/src/caffe/layers/image_data_layer.cpp +++ /dev/null @@ -1,174 +0,0 @@ -#ifdef USE_OPENCV -#include - -#include // NOLINT(readability/streams) -#include // NOLINT(readability/streams) -#include -#include -#include - -#include "caffe/data_transformer.hpp" -#include "caffe/layers/base_data_layer.hpp" -#include "caffe/layers/image_data_layer.hpp" -#include "caffe/util/benchmark.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/util/rng.hpp" - -namespace caffe { - -template -ImageDataLayer::~ImageDataLayer() { - this->StopInternalThread(); -} - -template -void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - const int new_height = this->layer_param_.image_data_param().new_height(); - const int new_width = this->layer_param_.image_data_param().new_width(); - const bool is_color = this->layer_param_.image_data_param().is_color(); - string root_folder = this->layer_param_.image_data_param().root_folder(); - - CHECK((new_height == 0 && new_width == 0) || - (new_height > 0 && new_width > 0)) << "Current implementation requires " - "new_height and new_width to be set at the same time."; - // Read the file with filenames and labels - const string& source = this->layer_param_.image_data_param().source(); - LOG(INFO) << "Opening file " << source; - std::ifstream infile(source.c_str()); - string line; - size_t pos; - int label; - while (std::getline(infile, line)) { - pos = line.find_last_of(' '); - label = atoi(line.substr(pos + 1).c_str()); - lines_.push_back(std::make_pair(line.substr(0, pos), label)); - } - - CHECK(!lines_.empty()) << "File is empty"; - - if (this->layer_param_.image_data_param().shuffle()) { - // randomly shuffle data - LOG(INFO) << "Shuffling data"; - const unsigned int prefetch_rng_seed = caffe_rng_rand(); - prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); - ShuffleImages(); - } - LOG(INFO) << "A total of " << lines_.size() << " images."; - - lines_id_ = 0; - // Check if we would need to randomly skip a few data points - if (this->layer_param_.image_data_param().rand_skip()) { - unsigned int skip = caffe_rng_rand() % - this->layer_param_.image_data_param().rand_skip(); - LOG(INFO) << "Skipping first " << skip << " data points."; - CHECK_GT(lines_.size(), skip) << "Not enough points to skip"; - lines_id_ = skip; - } - // Read an image, and use it to initialize the top blob. - cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); - CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first; - // Use data_transformer to infer the expected blob shape from a cv_image. 
- vector top_shape = this->data_transformer_->InferBlobShape(cv_img); - this->transformed_data_.Reshape(top_shape); - // Reshape prefetch_data and top[0] according to the batch_size. - const int batch_size = this->layer_param_.image_data_param().batch_size(); - CHECK_GT(batch_size, 0) << "Positive batch size required"; - top_shape[0] = batch_size; - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].data_.Reshape(top_shape); - } - top[0]->Reshape(top_shape); - - LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); - // label - vector label_shape(1, batch_size); - top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); - } -} - -template -void ImageDataLayer::ShuffleImages() { - caffe::rng_t* prefetch_rng = - static_cast(prefetch_rng_->generator()); - shuffle(lines_.begin(), lines_.end(), prefetch_rng); -} - -// This function is called on prefetch thread -template -void ImageDataLayer::load_batch(Batch* batch) { - CPUTimer batch_timer; - batch_timer.Start(); - double read_time = 0; - double trans_time = 0; - CPUTimer timer; - CHECK(batch->data_.count()); - CHECK(this->transformed_data_.count()); - ImageDataParameter image_data_param = this->layer_param_.image_data_param(); - const int batch_size = image_data_param.batch_size(); - const int new_height = image_data_param.new_height(); - const int new_width = image_data_param.new_width(); - const bool is_color = image_data_param.is_color(); - string root_folder = image_data_param.root_folder(); - - // Reshape according to the first image of each batch - // on single input batches allows for inputs of varying dimension. - cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); - CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first; - // Use data_transformer to infer the expected blob shape from a cv_img. - vector top_shape = this->data_transformer_->InferBlobShape(cv_img); - this->transformed_data_.Reshape(top_shape); - // Reshape batch according to the batch_size. - top_shape[0] = batch_size; - batch->data_.Reshape(top_shape); - - Dtype* prefetch_data = batch->data_.mutable_cpu_data(); - Dtype* prefetch_label = batch->label_.mutable_cpu_data(); - - // datum scales - const int lines_size = lines_.size(); - for (int item_id = 0; item_id < batch_size; ++item_id) { - // get a blob - timer.Start(); - CHECK_GT(lines_size, lines_id_); - cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); - CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first; - read_time += timer.MicroSeconds(); - timer.Start(); - // Apply transformations (mirror, crop...) to the image - int offset = batch->data_.offset(item_id); - this->transformed_data_.set_cpu_data(prefetch_data + offset); - this->data_transformer_->Transform(cv_img, &(this->transformed_data_)); - trans_time += timer.MicroSeconds(); - - prefetch_label[item_id] = lines_[lines_id_].second; - // go to the next iter - lines_id_++; - if (lines_id_ >= lines_size) { - // We have reached the end. Restart from the first. 
- DLOG(INFO) << "Restarting data prefetching from start."; - lines_id_ = 0; - if (this->layer_param_.image_data_param().shuffle()) { - ShuffleImages(); - } - } - } - batch_timer.Stop(); - DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; - DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; -} - -INSTANTIATE_CLASS(ImageDataLayer); -REGISTER_LAYER_CLASS(ImageData); - -} // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index e65349f..5838c7c 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -140,10 +140,6 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(InnerProductLayer); -#endif - INSTANTIATE_CLASS(InnerProductLayer); REGISTER_LAYER_CLASS(InnerProduct); diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu deleted file mode 100644 index a58b56e..0000000 --- a/src/caffe/layers/inner_product_layer.cu +++ /dev/null @@ -1,79 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layers/inner_product_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void InnerProductLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - if (M_ == 1) { - caffe_gpu_gemv(CblasNoTrans, N_, K_, (Dtype)1., - weight, bottom_data, (Dtype)0., top_data); - if (bias_term_) - caffe_gpu_axpy(N_, bias_multiplier_.cpu_data()[0], - this->blobs_[1]->gpu_data(), top_data); - } else { - caffe_gpu_gemm(CblasNoTrans, - transpose_ ? 
CblasNoTrans : CblasTrans,
-                          M_, N_, K_, (Dtype)1.,
-                          bottom_data, weight, (Dtype)0., top_data);
-    if (bias_term_)
-      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
-                            bias_multiplier_.gpu_data(),
-                            this->blobs_[1]->gpu_data(), (Dtype)1., top_data);
-  }
-}
-
-template <typename Dtype>
-void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (this->param_propagate_down_[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    // Gradient with respect to weight
-    if (transpose_) {
-      caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
-          K_, N_, M_,
-          (Dtype)1., bottom_data, top_diff,
-          (Dtype)1., this->blobs_[0]->mutable_gpu_diff());
-    } else {
-      caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
-          N_, K_, M_,
-          (Dtype)1., top_diff, bottom_data,
-          (Dtype)1., this->blobs_[0]->mutable_gpu_diff());
-    }
-  }
-  if (bias_term_ && this->param_propagate_down_[1]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    // Gradient with respect to bias
-    caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
-        bias_multiplier_.gpu_data(), (Dtype)1.,
-        this->blobs_[1]->mutable_gpu_diff());
-  }
-  if (propagate_down[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    // Gradient with respect to bottom data
-    if (transpose_) {
-      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans,
-          M_, K_, N_,
-          (Dtype)1., top_diff, this->blobs_[0]->gpu_data(),
-          (Dtype)0., bottom[0]->mutable_gpu_diff());
-    } else {
-      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
-          M_, K_, N_,
-          (Dtype)1., top_diff, this->blobs_[0]->gpu_data(),
-          (Dtype)0., bottom[0]->mutable_gpu_diff());
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(InnerProductLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp
index c70a795..a9e05c9 100644
--- a/src/caffe/layers/log_layer.cpp
+++ b/src/caffe/layers/log_layer.cpp
@@ -75,10 +75,6 @@ void LogLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   caffe_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
-#ifdef CPU_ONLY
-STUB_GPU(LogLayer);
-#endif
-
 INSTANTIATE_CLASS(LogLayer);
 REGISTER_LAYER_CLASS(Log);
 
diff --git a/src/caffe/layers/log_layer.cu b/src/caffe/layers/log_layer.cu
deleted file mode 100644
index db466db..0000000
--- a/src/caffe/layers/log_layer.cu
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <vector>
-
-#include "caffe/layers/log_layer.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
-    caffe_gpu_log(count, bottom_data, top_data);
-  } else {
-    caffe_copy(count, bottom_data, top_data);
-    if (input_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, input_scale_, top_data);
-    }
-    if (input_shift_ != Dtype(0)) {
-      caffe_gpu_add_scalar(count, input_shift_, top_data);
-    }
-    caffe_gpu_log(count, top_data, top_data);
-  }
-  if (base_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, base_scale_, top_data);
-  }
-}
-
-template <typename Dtype>
-void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  caffe_copy(count, bottom_data, bottom_diff);
-  if
(input_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, input_scale_, bottom_diff);
-  }
-  if (input_shift_ != Dtype(0)) {
-    caffe_gpu_add_scalar(count, input_shift_, bottom_diff);
-  }
-  caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff);
-  if (backward_num_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, backward_num_scale_, bottom_diff);
-  }
-  caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(LogLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index 210525e..d7b8bd9 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -246,12 +246,6 @@ void LRNLayer<Dtype>::WithinChannelBackward(
   }
 }
 
-#ifdef CPU_ONLY
-STUB_GPU(LRNLayer);
-STUB_GPU_FORWARD(LRNLayer, CrossChannelForward);
-STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward);
-#endif
-
 INSTANTIATE_CLASS(LRNLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/lrn_layer.cu b/src/caffe/layers/lrn_layer.cu
deleted file mode 100644
index 26e619c..0000000
--- a/src/caffe/layers/lrn_layer.cu
+++ /dev/null
@@ -1,202 +0,0 @@
-#include <vector>
-
-#include "caffe/layers/lrn_layer.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void LRNFillScale(const int nthreads, const Dtype* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype alpha_over_size,
-    const Dtype k, Dtype* const scale) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int n = index / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    const int step = height * width;
-    const Dtype* const in_off = in + offset;
-    Dtype* const scale_off = scale + offset;
-    int head = 0;
-    const int pre_pad = (size - 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-    Dtype accum_scale = 0;
-    // fill the scale at [n, :, h, w]
-    // accumulate values
-    while (head < post_pad && head < channels) {
-      accum_scale += in_off[head * step] * in_off[head * step];
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_scale += in_off[head * step] * in_off[head * step];
-      if (head - size >= 0) {
-        accum_scale -= in_off[(head - size) * step]
-                       * in_off[(head - size) * step];
-      }
-      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      if (head - size >= 0) {
-        accum_scale -= in_off[(head - size) * step]
-                       * in_off[(head - size) * step];
-      }
-      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-  }
-}
-
-
-template <typename Dtype>
-void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    CrossChannelForward_gpu(bottom, top);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    WithinChannelForward(bottom, top);
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
-}
-
-// TODO: check if it would be faster to just put it into the previous kernel.
-template -__global__ void LRNComputeOutput(const int nthreads, const Dtype* const in, - const Dtype* const scale, const Dtype negative_beta, Dtype* const out) { - CUDA_KERNEL_LOOP(index, nthreads) { - out[index] = in[index] * pow(scale[index], negative_beta); - } -} - -template -void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, compute scale - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - // We will launch one kernel for each pixel location, and have the kernel - // go through all the channels. - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNFillScale<<>>( - n_threads, bottom_data, num_, channels_, height_, width_, size_, - alpha_ / size_, k_, scale_data); - CUDA_POST_KERNEL_CHECK; - n_threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeOutput<<>>( - n_threads, bottom_data, scale_data, -beta_, top_data); - CUDA_POST_KERNEL_CHECK; -} -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); - - -template -void LRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelBackward_gpu(top, propagate_down, bottom); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelBackward(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -template -__global__ void LRNComputeDiff(const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - const Dtype* const bottom_off = bottom_data + offset; - const Dtype* const top_off = top_data + offset; - const Dtype* const scale_off = scale + offset; - const Dtype* const top_diff_off = top_diff + offset; - Dtype* const bottom_diff_off = bottom_diff + offset; - int head = 0; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; - Dtype accum_ratio = 0; - // accumulate values - while (head < post_pad && head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - if (head - size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - 
size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - } -} - -template -void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeDiff<<>>( - n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), - scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, - size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), - bottom[0]->mutable_gpu_diff()); -} -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); - - - -INSTANTIATE_LAYER_GPU_FUNCS(LRNLayer); - -} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp index 277c031..78617c5 100644 --- a/src/caffe/layers/lstm_unit_layer.cpp +++ b/src/caffe/layers/lstm_unit_layer.cpp @@ -121,10 +121,6 @@ void LSTMUnitLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(LSTMUnitLayer); -#endif - INSTANTIATE_CLASS(LSTMUnitLayer); REGISTER_LAYER_CLASS(LSTMUnit); diff --git a/src/caffe/layers/lstm_unit_layer.cu b/src/caffe/layers/lstm_unit_layer.cu deleted file mode 100644 index 15bb451..0000000 --- a/src/caffe/layers/lstm_unit_layer.cu +++ /dev/null @@ -1,154 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/layers/lstm_layer.hpp" - -namespace caffe { - -template -__device__ Dtype sigmoid(const Dtype x) { - return Dtype(1) / (Dtype(1) + exp(-x)); -} - -template -__device__ Dtype tanh(const Dtype x) { - return Dtype(2) * sigmoid(Dtype(2) * x) - Dtype(1); -} - -template -__global__ void LSTMActsForward(const int nthreads, const int dim, - const Dtype* X, Dtype* X_acts) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int x_dim = 4 * dim; - const int d = index % x_dim; - if (d < 3 * dim) { - X_acts[index] = sigmoid(X[index]); - } else { - X_acts[index] = tanh(X[index]); - } - } -} - -template -__global__ void LSTMUnitForward(const int nthreads, const int dim, - const Dtype* C_prev, const Dtype* X, const Dtype* cont, - Dtype* C, Dtype* H) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / dim; - const int d = index % dim; - const Dtype* X_offset = X + 4 * dim * n; - const Dtype i = X_offset[d]; - const Dtype f = X_offset[1 * dim + d]; - const Dtype o = X_offset[2 * dim + d]; - const Dtype g = X_offset[3 * dim + d]; - const Dtype c_prev = C_prev[index]; - const Dtype c = cont[n] * f * c_prev + i * g; - C[index] = c; - const Dtype tanh_c = tanh(c); - H[index] = o * tanh_c; - } -} - -template -void LSTMUnitLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = top[1]->count(); - const Dtype* C_prev = bottom[0]->gpu_data(); - const Dtype* X = bottom[1]->gpu_data(); - const Dtype* cont = bottom[2]->gpu_data(); - Dtype* X_acts = X_acts_.mutable_gpu_data(); - Dtype* C = top[0]->mutable_gpu_data(); - Dtype* H = top[1]->mutable_gpu_data(); - const int X_count = bottom[1]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - 
LSTMActsForward<<>>( - X_count, hidden_dim_, X, X_acts); - CUDA_POST_KERNEL_CHECK; - // NOLINT_NEXT_LINE(whitespace/operators) - LSTMUnitForward<<>>( - count, hidden_dim_, C_prev, X_acts, cont, C, H); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void LSTMUnitBackward(const int nthreads, const int dim, - const Dtype* C_prev, const Dtype* X, const Dtype* C, const Dtype* H, - const Dtype* cont, const Dtype* C_diff, const Dtype* H_diff, - Dtype* C_prev_diff, Dtype* X_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / dim; - const int d = index % dim; - const Dtype* X_offset = X + 4 * dim * n; - const Dtype i = X_offset[d]; - const Dtype f = X_offset[1 * dim + d]; - const Dtype o = X_offset[2 * dim + d]; - const Dtype g = X_offset[3 * dim + d]; - const Dtype c_prev = C_prev[index]; - const Dtype c = C[index]; - const Dtype tanh_c = tanh(c); - Dtype* c_prev_diff = C_prev_diff + index; - Dtype* X_diff_offset = X_diff + 4 * dim * n; - Dtype* i_diff = X_diff_offset + d; - Dtype* f_diff = X_diff_offset + 1 * dim + d; - Dtype* o_diff = X_diff_offset + 2 * dim + d; - Dtype* g_diff = X_diff_offset + 3 * dim + d; - const Dtype c_term_diff = - C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); - const Dtype cont_n = cont[n]; - *c_prev_diff = cont_n * c_term_diff * f; - *i_diff = c_term_diff * g; - *f_diff = cont_n * c_term_diff * c_prev; - *o_diff = H_diff[index] * tanh_c; - *g_diff = c_term_diff * i; - } -} - -template -__global__ void LSTMActsBackward(const int nthreads, const int dim, - const Dtype* X_acts, const Dtype* X_acts_diff, Dtype* X_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int x_dim = 4 * dim; - const int d = index % x_dim; - const Dtype X_act = X_acts[index]; - if (d < 3 * dim) { - X_diff[index] = X_acts_diff[index] * X_act * (Dtype(1) - X_act); - } else { - X_diff[index] = X_acts_diff[index] * (Dtype(1) - X_act * X_act); - } - } -} - -template -void LSTMUnitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; - if (!propagate_down[0] && !propagate_down[1]) { return; } - - const int count = top[1]->count(); - const Dtype* C_prev = bottom[0]->gpu_data(); - const Dtype* X_acts = X_acts_.gpu_data(); - const Dtype* cont = bottom[2]->gpu_data(); - const Dtype* C = top[0]->gpu_data(); - const Dtype* H = top[1]->gpu_data(); - const Dtype* C_diff = top[0]->gpu_diff(); - const Dtype* H_diff = top[1]->gpu_diff(); - Dtype* C_prev_diff = bottom[0]->mutable_gpu_diff(); - Dtype* X_acts_diff = X_acts_.mutable_gpu_diff(); - LSTMUnitBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>(count, hidden_dim_, - C_prev, X_acts, C, H, cont, C_diff, H_diff, C_prev_diff, X_acts_diff); - CUDA_POST_KERNEL_CHECK; - const int X_count = bottom[1]->count(); - Dtype* X_diff = bottom[1]->mutable_gpu_diff(); - LSTMActsBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - X_count, hidden_dim_, X_acts, X_acts_diff, X_diff); - CUDA_POST_KERNEL_CHECK; -} - -INSTANTIATE_LAYER_GPU_FUNCS(LSTMUnitLayer); - -} // namespace caffe diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp deleted file mode 100644 index 975f484..0000000 --- a/src/caffe/layers/memory_data_layer.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#ifdef USE_OPENCV -#include -#endif // USE_OPENCV - -#include - -#include "caffe/layers/memory_data_layer.hpp" - -namespace caffe { - -template -void MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, - 
const vector*>& top) { - batch_size_ = this->layer_param_.memory_data_param().batch_size(); - channels_ = this->layer_param_.memory_data_param().channels(); - height_ = this->layer_param_.memory_data_param().height(); - width_ = this->layer_param_.memory_data_param().width(); - size_ = channels_ * height_ * width_; - CHECK_GT(batch_size_ * size_, 0) << - "batch_size, channels, height, and width must be specified and" - " positive in memory_data_param"; - vector label_shape(1, batch_size_); - top[0]->Reshape(batch_size_, channels_, height_, width_); - top[1]->Reshape(label_shape); - added_data_.Reshape(batch_size_, channels_, height_, width_); - added_label_.Reshape(label_shape); - data_ = NULL; - labels_ = NULL; - added_data_.cpu_data(); - added_label_.cpu_data(); -} - -template -void MemoryDataLayer::AddDatumVector(const vector& datum_vector) { - CHECK(!has_new_data_) << - "Can't add data until current data has been consumed."; - size_t num = datum_vector.size(); - CHECK_GT(num, 0) << "There is no datum to add."; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; - added_data_.Reshape(num, channels_, height_, width_); - added_label_.Reshape(num, 1, 1, 1); - // Apply data transformations (mirror, scale, crop...) - this->data_transformer_->Transform(datum_vector, &added_data_); - // Copy Labels - Dtype* top_label = added_label_.mutable_cpu_data(); - for (int item_id = 0; item_id < num; ++item_id) { - top_label[item_id] = datum_vector[item_id].label(); - } - // num_images == batch_size_ - Dtype* top_data = added_data_.mutable_cpu_data(); - Reset(top_data, top_label, num); - has_new_data_ = true; -} - -#ifdef USE_OPENCV -template -void MemoryDataLayer::AddMatVector(const vector& mat_vector, - const vector& labels) { - size_t num = mat_vector.size(); - CHECK(!has_new_data_) << - "Can't add mat until current data has been consumed."; - CHECK_GT(num, 0) << "There is no mat to add"; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; - added_data_.Reshape(num, channels_, height_, width_); - added_label_.Reshape(num, 1, 1, 1); - // Apply data transformations (mirror, scale, crop...) - this->data_transformer_->Transform(mat_vector, &added_data_); - // Copy Labels - Dtype* top_label = added_label_.mutable_cpu_data(); - for (int item_id = 0; item_id < num; ++item_id) { - top_label[item_id] = labels[item_id]; - } - // num_images == batch_size_ - Dtype* top_data = added_data_.mutable_cpu_data(); - Reset(top_data, top_label, num); - has_new_data_ = true; -} -#endif // USE_OPENCV - -template -void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { - CHECK(data); - CHECK(labels); - CHECK_EQ(n % batch_size_, 0) << "n must be a multiple of batch size"; - // Warn with transformation parameters since a memory array is meant to - // be generic and no transformations are done with Reset(). 
- if (this->layer_param_.has_transform_param()) { - LOG(WARNING) << this->type() << " does not transform array data on Reset()"; - } - data_ = data; - labels_ = labels; - n_ = n; - pos_ = 0; -} - -template -void MemoryDataLayer::set_batch_size(int new_size) { - CHECK(!has_new_data_) << - "Can't change batch_size until current data has been consumed."; - batch_size_ = new_size; - added_data_.Reshape(batch_size_, channels_, height_, width_); - added_label_.Reshape(batch_size_, 1, 1, 1); -} - -template -void MemoryDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - CHECK(data_) << "MemoryDataLayer needs to be initialized by calling Reset"; - top[0]->Reshape(batch_size_, channels_, height_, width_); - top[1]->Reshape(batch_size_, 1, 1, 1); - top[0]->set_cpu_data(data_ + pos_ * size_); - top[1]->set_cpu_data(labels_ + pos_); - pos_ = (pos_ + batch_size_) % n_; - if (pos_ == 0) - has_new_data_ = false; -} - -INSTANTIATE_CLASS(MemoryDataLayer); -REGISTER_LAYER_CLASS(MemoryData); - -} // namespace caffe diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index 8fe4ef8..31c381d 100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -123,11 +123,6 @@ void MVNLayer::Backward_cpu(const vector*>& top, } } - -#ifdef CPU_ONLY -STUB_GPU(MVNLayer); -#endif - INSTANTIATE_CLASS(MVNLayer); REGISTER_LAYER_CLASS(MVN); diff --git a/src/caffe/layers/mvn_layer.cu b/src/caffe/layers/mvn_layer.cu deleted file mode 100644 index 739293b..0000000 --- a/src/caffe/layers/mvn_layer.cu +++ /dev/null @@ -1,110 +0,0 @@ -#include - -#include "caffe/layers/mvn_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void MVNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - // subtract mean - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), - top_data); // X-EX - - if (this->layer_param_.mvn_param().normalize_variance()) { - // compute variance using var(X) = E((X-EX)^2) - caffe_gpu_powx(bottom[0]->count(), top_data, Dtype(2), - temp_.mutable_gpu_data()); // (X-EX)^2 - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, temp_.gpu_data(), - sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E((X-EX)^2) - - // normalize variance - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - } -} - -template -void MVNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - bottom_diff); - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., - bottom_diff); - - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); - } else { - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(MVNLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 90897db..563c490 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -306,11 +306,6 @@ void PoolingLayer::Backward_cpu(const vector*>& top, } } - -#ifdef CPU_ONLY -STUB_GPU(PoolingLayer); -#endif - INSTANTIATE_CLASS(PoolingLayer); } // namespace caffe diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu deleted file mode 100644 index 1ea46cc..0000000 --- a/src/caffe/layers/pooling_layer.cu +++ /dev/null @@ -1,386 +0,0 @@ -#include -#include -#include - -#include "caffe/layers/pooling_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void MaxPoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data, int* mask, Dtype* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - Dtype maxval = -FLT_MAX; - int maxidx = -1; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (bottom_slice[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = bottom_slice[maxidx]; - } - } - } - top_data[index] = maxval; - if (mask) { - mask[index] = maxidx; - } else { - top_mask[index] = maxidx; - } - } -} - -template -__global__ void AvePoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - Dtype aveval = 0; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; 
++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_slice[h * width + w]; - } - } - top_data[index] = aveval / pool_size; - } -} - -template -__global__ void StoPoolForwardTrain(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const rand_idx, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - Dtype cumsum = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - } - } - const float thres = rand_idx[index] * cumsum; - // Second pass: get value, and set index. - cumsum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - if (cumsum >= thres) { - rand_idx[index] = ((n * channels + c) * height + h) * width + w; - top_data[index] = bottom_slice[h * width + w]; - return; - } - } - } - } -} - - -template -__global__ void StoPoolForwardTest(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - // We set cumsum to be 0 to avoid divide-by-zero problems - Dtype cumsum = FLT_MIN; - Dtype cumvalues = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; - } - } - top_data[index] = cumvalues / cumsum; - } -} - - -template -void PoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int count = top[0]->count(); - // We'll output the mask to top[1] if it's of size >1. 
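The StoPoolForwardTrain and StoPoolForwardTest kernels defined above implement stochastic pooling: during training an element of each window is sampled with probability proportional to its activation, and at test time the window is reduced to the probability-weighted average sum(x^2) / sum(x). A standalone CPU sketch of that per-window rule follows (the window contents and the random draw are illustrative; the rule assumes non-negative activations, as after a ReLU).

#include <cstdio>
#include <vector>

// Training-time rule: pick the first element whose running sum crosses
// r * sum(window), where r is uniform in [0, 1). Mirrors StoPoolForwardTrain.
float StochasticPoolTrain(const std::vector<float>& window, float r) {
  float sum = 0.f;
  for (float v : window) sum += v;
  const float thresh = r * sum;
  float cumsum = 0.f;
  for (float v : window) {
    cumsum += v;
    if (cumsum >= thresh) return v;
  }
  return window.empty() ? 0.f : window.back();
}

// Test-time rule: probability-weighted average, sum(x^2) / sum(x).
// Mirrors StoPoolForwardTest; the tiny initial sum avoids division by zero.
float StochasticPoolTest(const std::vector<float>& window) {
  float sum = 1e-37f, sumsq = 0.f;
  for (float v : window) { sum += v; sumsq += v * v; }
  return sumsq / sum;
}

int main() {
  std::vector<float> window = {0.1f, 0.4f, 0.5f};
  std::printf("train (r=0.30): %.2f\n", StochasticPoolTrain(window, 0.30f));  // 0.40
  std::printf("test: %.3f\n", StochasticPoolTest(window));                    // 0.420
  return 0;
}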
- const bool use_top_mask = top.size() > 1; - int* mask = NULL; - Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->mutable_gpu_data(); - } else { - mask = max_idx_.mutable_gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, - mask, top_mask); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - if (this->phase_ == TRAIN) { - // We need to create the random index as well. - caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), - rand_idx_.mutable_gpu_data()); - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTrain<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, - rand_idx_.mutable_gpu_data(), top_data); - } else { - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTest<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, top_data); - } - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -template -__global__ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 
0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - Dtype gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = top_diff + offset; - if (mask) { - const int* const mask_slice = mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } else { - const Dtype* const top_mask_slice = top_mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } - bottom_diff[index] = gradient; - } -} - -template -__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width + pad_w; - const int h = (index / width) % height + pad_h; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - // figure out the pooling size - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; - } - } - bottom_diff[index] = gradient; - } -} - - -template -__global__ void StoPoolBackward(const int nthreads, - const Dtype* const rand_idx, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const rand_idx_slice = - rand_idx + (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff_slice[ph * pooled_width + pw] * - (index == static_cast(rand_idx_slice[ph * pooled_width + pw])); - } - } - bottom_diff[index] = gradient; - } -} - - -template -void PoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - caffe_gpu_set(count, Dtype(0.), bottom_diff); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - const int* mask = NULL; - const Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->gpu_data(); - } else { - mask = max_idx_.gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolBackward<<>>( - count, top_diff, mask, top_mask, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, - kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - bottom_diff); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolBackward<<>>( - count, top_diff, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolBackward<<>>( - count, rand_idx_.gpu_data(), top_diff, - top[0]->num(), channels_, height_, width_, pooled_height_, - pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - bottom_diff); - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PoolingLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index d99b77c..2cc9920 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -92,10 +92,6 @@ void PowerLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(PowerLayer); -#endif - INSTANTIATE_CLASS(PowerLayer); REGISTER_LAYER_CLASS(Power); diff --git a/src/caffe/layers/power_layer.cu b/src/caffe/layers/power_layer.cu deleted file mode 100644 index 07711c4..0000000 --- a/src/caffe/layers/power_layer.cu +++ /dev/null @@ -1,85 +0,0 @@ -#include - -#include "caffe/layers/power_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void PowerLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? 
Dtype(1) : pow(shift_, power_); - caffe_gpu_set(count, value, top_data); - return; - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } -} - -template -void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_gpu_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(PowerLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 853181b..e4f965f 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -130,11 +130,6 @@ void PReLULayer::Backward_cpu(const vector*>& top, } } - -#ifdef CPU_ONLY -STUB_GPU(PReLULayer); -#endif - INSTANTIATE_CLASS(PReLULayer); REGISTER_LAYER_CLASS(PReLU); diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/prelu_layer.cu deleted file mode 100644 index aeb80ea..0000000 --- a/src/caffe/layers/prelu_layer.cu +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include - -#include "caffe/layers/neuron_layer.hpp" -#include "caffe/layers/prelu_layer.hpp" - -namespace caffe { - -// CUDA kernele for forward -template -__global__ void PReLUForward(const int n, const int channels, const int dim, - const Dtype* in, Dtype* out, const Dtype* slope_data, - const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out[index] = in[index] > 0 ? 
in[index] : in[index] * slope_data[c]; - } -} - -// CUDA kernel for bottom backward -template -__global__ void PReLUBackward(const int n, const int channels, const int dim, - const Dtype* in_diff, const Dtype* in_data, Dtype* out_diff, - const Dtype* slope_data, const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * slope_data[c]); - } -} - -// CUDA kernel for element-wise parameter backward -template -__global__ void PReLUParamBackward(const int n, - const int rows, const int rowPitch, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); - for ( int k = 1; k < rows; k++ ) { - out_diff[index] += in_diff[index + k*rowPitch] - * in_data[index + k*rowPitch] * (in_data[index + k*rowPitch] <= 0); - } - } -} - -template -void PReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - const int div_factor = channel_shared_ ? channels : 1; - - // For in-place computation - if (top[0] == bottom[0]) { - caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); - } - - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUForward<<>>( - count, channels, dim, bottom_data, top_data, slope_data, div_factor); - CUDA_POST_KERNEL_CHECK; -} - -template -void PReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - - // For in-place computation - if (top[0] == bottom[0]) { - bottom_data = bottom_memory_.gpu_data(); - } - - // Propagate to param - // Since to write bottom diff will affect top diff if top and bottom blobs - // are identical (in-place computaion), we first compute param backward to - // keep top_diff unchanged. - if (this->param_propagate_down_[0]) { - Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); - int cdim = channels * dim; - - // compute element-wise diff - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUParamBackward<<>>( - cdim, bottom[0]->num(), top[0]->offset(1), top_diff , - bottom_data , - backward_buff_.mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - if (channel_shared_) { - Dtype dsum; - caffe_gpu_dot(channels * dim, backward_buff_.gpu_diff(), - multiplier_.gpu_data(), &dsum); - caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); - } else { - caffe_gpu_gemv(CblasNoTrans, channels, dim, 1., - backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., - slope_diff); - } - } - // Propagate to bottom - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - int div_factor = channel_shared_ ? 
channels : 1; - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUBackward<<>>( - count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data, - div_factor); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp index e0c8277..9147eb0 100644 --- a/src/caffe/layers/recurrent_layer.cpp +++ b/src/caffe/layers/recurrent_layer.cpp @@ -286,10 +286,6 @@ void RecurrentLayer::Backward_cpu(const vector*>& top, unrolled_net_->BackwardFrom(last_layer_index_); } -#ifdef CPU_ONLY -STUB_GPU_FORWARD(RecurrentLayer, Forward); -#endif - INSTANTIATE_CLASS(RecurrentLayer); } // namespace caffe diff --git a/src/caffe/layers/recurrent_layer.cu b/src/caffe/layers/recurrent_layer.cu deleted file mode 100644 index 4dd2b0e..0000000 --- a/src/caffe/layers/recurrent_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/layers/recurrent_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void RecurrentLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Hacky fix for test time... reshare all the shared blobs. - // TODO: somehow make this work non-hackily. - if (this->phase_ == TEST) { - unrolled_net_->ShareWeights(); - } - - DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size()); - if (!expose_hidden_) { - for (int i = 0; i < recur_input_blobs_.size(); ++i) { - const int count = recur_input_blobs_[i]->count(); - DCHECK_EQ(count, recur_output_blobs_[i]->count()); - const Dtype* timestep_T_data = recur_output_blobs_[i]->gpu_data(); - Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data(); - caffe_copy(count, timestep_T_data, timestep_0_data); - } - } - - unrolled_net_->ForwardTo(last_layer_index_); - - if (expose_hidden_) { - const int top_offset = output_blobs_.size(); - for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) { - top[i]->ShareData(*recur_output_blobs_[j]); - } - } -} - -INSTANTIATE_LAYER_GPU_FORWARD(RecurrentLayer); - -} // namespace caffe diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index fa46487..460b8fa 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -119,10 +119,6 @@ void ReductionLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(ReductionLayer); -#endif - INSTANTIATE_CLASS(ReductionLayer); REGISTER_LAYER_CLASS(Reduction); diff --git a/src/caffe/layers/reduction_layer.cu b/src/caffe/layers/reduction_layer.cu deleted file mode 100644 index 4a6b2b7..0000000 --- a/src/caffe/layers/reduction_layer.cu +++ /dev/null @@ -1,91 +0,0 @@ -#include - -#include "caffe/layers/reduction_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void ReductionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* mult_data = NULL; - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.gpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_asum(dim_, bottom_data, 
top_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - ++top_data; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. - top_data = top[0]->mutable_gpu_data(); - caffe_gpu_scal(num_, coeff_, top_data); - } -} - -template -void ReductionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - // Get bottom_data, if needed. - const Dtype* bottom_data = NULL; - switch (op_) { - // Operations that don't need bottom_data - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - break; - // Operations that need bottom_data - case ReductionParameter_ReductionOp_ASUM: - case ReductionParameter_ReductionOp_SUMSQ: - bottom_data = bottom[0]->gpu_data(); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*top_diff) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_set(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_sign(dim_, bottom_data, bottom_diff); - caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - bottom_diff += dim_; - ++top_diff; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ReductionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index 92a729c..179d3db 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -35,11 +35,6 @@ void ReLULayer::Backward_cpu(const vector*>& top, } } - -#ifdef CPU_ONLY -STUB_GPU(ReLULayer); -#endif - INSTANTIATE_CLASS(ReLULayer); } // namespace caffe diff --git a/src/caffe/layers/relu_layer.cu b/src/caffe/layers/relu_layer.cu deleted file mode 100644 index 4bf15b3..0000000 --- a/src/caffe/layers/relu_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include - -#include "caffe/layers/relu_layer.hpp" - -namespace caffe { - -template -__global__ void ReLUForward(const int n, const Dtype* in, Dtype* out, - Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? 
in[index] : in[index] * negative_slope; - } -} - -template -void ReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUForward<<>>( - count, bottom_data, top_data, negative_slope); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void ReLUBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff, Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * negative_slope); - } -} - -template -void ReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUBackward<<>>( - count, top_diff, bottom_data, bottom_diff, negative_slope); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(ReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/scale_layer.cpp b/src/caffe/layers/scale_layer.cpp index ecdbb12..21852a4 100644 --- a/src/caffe/layers/scale_layer.cpp +++ b/src/caffe/layers/scale_layer.cpp @@ -209,10 +209,6 @@ void ScaleLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(ScaleLayer); -#endif - INSTANTIATE_CLASS(ScaleLayer); REGISTER_LAYER_CLASS(Scale); diff --git a/src/caffe/layers/scale_layer.cu b/src/caffe/layers/scale_layer.cu deleted file mode 100644 index fc9a806..0000000 --- a/src/caffe/layers/scale_layer.cu +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include - -#include "caffe/layers/scale_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void ScaleForward(const int n, const Dtype* in, - const Dtype* scale, const int scale_dim, const int inner_dim, - Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - const int scale_index = (index / inner_dim) % scale_dim; - out[index] = in[index] * scale[scale_index]; - } -} - -template -__global__ void ScaleBiasForward(const int n, const Dtype* in, - const Dtype* scale, const Dtype* bias, - const int scale_dim, const int inner_dim, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - const int scale_index = (index / inner_dim) % scale_dim; - out[index] = in[index] * scale[scale_index] + bias[scale_index]; - } -} - -template -void ScaleLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = top[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - if (bottom[0] == top[0]) { - // in-place computation; need to store bottom data before overwriting it. - // Note that this is only necessary for Backward; we could skip this if not - // doing Backward, but Caffe currently provides no way of knowing whether - // we'll need to do Backward at the time of the Forward call. 
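The ScaleForward and ScaleBiasForward kernels defined above, which this Forward_gpu launches after the optional in-place copy, broadcast a per-channel coefficient across the blob: element i uses coefficient (i / inner_dim_) % scale_dim_. A standalone CPU sketch of that broadcast indexing, with a deliberately tiny shape (the function name and shapes are illustrative only):

#include <cstdio>
#include <vector>

// CPU version of the indexing used by ScaleForward / ScaleBiasForward:
// out[i] = in[i] * scale[(i / inner_dim) % scale_dim] + bias[same index]
void ScaleBiasForwardCPU(const std::vector<float>& in,
                         const std::vector<float>& scale,
                         const std::vector<float>& bias,
                         int scale_dim, int inner_dim,
                         std::vector<float>* out) {
  out->resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    const size_t s = (i / inner_dim) % scale_dim;
    (*out)[i] = in[i] * scale[s] + bias[s];
  }
}

int main() {
  // One image with 2 channels of 3 elements each: scale_dim = 2, inner_dim = 3.
  std::vector<float> in = {1, 2, 3, 4, 5, 6};
  std::vector<float> scale = {10, 100};
  std::vector<float> bias = {0.5f, -0.5f};
  std::vector<float> out;
  ScaleBiasForwardCPU(in, scale, bias, 2, 3, &out);
  for (float v : out) std::printf("%g ", v);  // 10.5 20.5 30.5 399.5 499.5 599.5
  std::printf("\n");
  return 0;
}

Whether the coefficients come from the layer's own blobs_ or from bottom[1] is decided a few lines above; the broadcast itself is the same either way.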
- caffe_copy(bottom[0]->count(), bottom[0]->gpu_data(), - temp_.mutable_gpu_data()); - } - const Dtype* scale_data = - ((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (bias_layer_) { - const Dtype* bias_data = this->blobs_[bias_param_id_]->gpu_data(); - ScaleBiasForward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, bottom_data, scale_data, bias_data, scale_dim_, inner_dim_, - top_data); - } else { - ScaleForward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, bottom_data, scale_data, scale_dim_, inner_dim_, top_data); - } -} - -template -void ScaleLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (bias_layer_ && - this->param_propagate_down_[this->param_propagate_down_.size() - 1]) { - bias_layer_->Backward(top, bias_propagate_down_, bias_bottom_vec_); - } - const bool scale_param = (bottom.size() == 1); - Blob* scale = scale_param ? this->blobs_[0].get() : bottom[1]; - if ((!scale_param && propagate_down[1]) || - (scale_param && this->param_propagate_down_[0])) { - const Dtype* top_diff = top[0]->gpu_diff(); - const bool in_place = (bottom[0] == top[0]); - const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->gpu_data(); - // Hack: store big eltwise product in bottom[0] diff, except in the special - // case where this layer itself does the eltwise product, in which case we - // can store it directly in the scale diff, and we're done. - // If we're computing in-place (and not doing eltwise computation), this - // hack doesn't work and we store the product in temp_. - const bool is_eltwise = (bottom[0]->count() == scale->count()); - Dtype* product = (is_eltwise ? scale->mutable_gpu_diff() : - (in_place ? temp_.mutable_gpu_data() : bottom[0]->mutable_gpu_diff())); - caffe_gpu_mul(top[0]->count(), top_diff, bottom_data, product); - if (!is_eltwise) { - Dtype* sum_result = NULL; - if (inner_dim_ == 1) { - sum_result = product; - } else if (sum_result_.count() == 1) { - const Dtype* sum_mult = sum_multiplier_.gpu_data(); - Dtype* scale_diff = scale->mutable_cpu_diff(); - if (scale_param) { - Dtype result; - caffe_gpu_dot(inner_dim_, product, sum_mult, &result); - *scale_diff += result; - } else { - caffe_gpu_dot(inner_dim_, product, sum_mult, scale_diff); - } - } else { - const Dtype* sum_mult = sum_multiplier_.gpu_data(); - sum_result = (outer_dim_ == 1) ? 
- scale->mutable_gpu_diff() : sum_result_.mutable_gpu_data(); - caffe_gpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_, - Dtype(1), product, sum_mult, Dtype(0), sum_result); - } - if (outer_dim_ != 1) { - const Dtype* sum_mult = sum_multiplier_.gpu_data(); - if (scale_dim_ == 1) { - Dtype* scale_diff = scale->mutable_cpu_diff(); - if (scale_param) { - Dtype result; - caffe_gpu_dot(outer_dim_, sum_mult, sum_result, &result); - *scale_diff += result; - } else { - caffe_gpu_dot(outer_dim_, sum_mult, sum_result, scale_diff); - } - } else { - Dtype* scale_diff = scale->mutable_gpu_diff(); - caffe_gpu_gemv(CblasTrans, outer_dim_, scale_dim_, - Dtype(1), sum_result, sum_mult, Dtype(scale_param), - scale_diff); - } - } - } - } - if (propagate_down[0]) { - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* scale_data = scale->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - ScaleForward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, top_diff, scale_data, scale_dim_, inner_dim_, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ScaleLayer); - -} // namespace caffe diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 10ac947..2bc82f9 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -67,10 +67,6 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( } } -#ifdef CPU_ONLY -STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward); -#endif - INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer); REGISTER_LAYER_CLASS(SigmoidCrossEntropyLoss); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu deleted file mode 100644 index 046cb9d..0000000 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ /dev/null @@ -1,34 +0,0 @@ -#include - -#include "caffe/layers/sigmoid_cross_entropy_loss_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = bottom[1]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 85fd967..8495fc2 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -37,10 +37,6 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(SigmoidLayer); -#endif - INSTANTIATE_CLASS(SigmoidLayer); diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu deleted file mode 100644 index 184c61e..0000000 --- a/src/caffe/layers/sigmoid_layer.cu +++ /dev/null @@ -1,60 +0,0 @@ -#include 
-#include - -#include "caffe/layers/sigmoid_layer.hpp" - -namespace caffe { - -template -__global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = 1. / (1. + exp(-in[index])); - } -} - -template -void SigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void SigmoidBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - const Dtype sigmoid_x = out_data[index]; - out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); - } -} - -template -void SigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidBackward<<>>( - count, top_diff, top_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SigmoidLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index b2f85c5..555066f 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -16,10 +16,6 @@ void SilenceLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(SilenceLayer); -#endif - INSTANTIATE_CLASS(SilenceLayer); REGISTER_LAYER_CLASS(Silence); diff --git a/src/caffe/layers/silence_layer.cu b/src/caffe/layers/silence_layer.cu deleted file mode 100644 index 3494f6f..0000000 --- a/src/caffe/layers/silence_layer.cu +++ /dev/null @@ -1,27 +0,0 @@ -#include - -#include "caffe/layers/silence_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void SilenceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Do nothing. 
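Looking back at the sigmoid kernels deleted above, SigmoidBackward can form the gradient from the saved output alone because of the identity s'(x) = s(x) * (1 - s(x)) for s(x) = 1 / (1 + exp(-x)). A short standalone check of that identity against a central finite difference (the test point 0.3 is arbitrary):

#include <cmath>
#include <cstdio>

static double Sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

int main() {
  const double x = 0.3, eps = 1e-6;
  const double s = Sigmoid(x);
  const double analytic = s * (1.0 - s);  // what SigmoidBackward multiplies into top_diff
  const double numeric = (Sigmoid(x + eps) - Sigmoid(x - eps)) / (2.0 * eps);
  std::printf("analytic=%.8f numeric=%.8f\n", analytic, numeric);
  return 0;
}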
-} - -template -void SilenceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - caffe_gpu_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_gpu_diff()); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SilenceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index 759beaf..c47a44a 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -114,10 +114,6 @@ void SliceLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(SliceLayer); -#endif - INSTANTIATE_CLASS(SliceLayer); REGISTER_LAYER_CLASS(Slice); diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/slice_layer.cu deleted file mode 100644 index 1be3a79..0000000 --- a/src/caffe/layers/slice_layer.cu +++ /dev/null @@ -1,71 +0,0 @@ -#include - -#include "caffe/layers/slice_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void Slice(const int nthreads, const Dtype* in_data, - const bool forward, const int num_slices, const int slice_size, - const int bottom_slice_axis, const int top_slice_axis, - const int offset_slice_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_slice_size = slice_size * top_slice_axis; - const int slice_num = index / total_slice_size; - const int slice_index = index % total_slice_size; - const int bottom_index = slice_index + - (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; - if (forward) { - out_data[index] = in_data[bottom_index]; - } else { - out_data[bottom_index] = in_data[index]; - } - } -} - -template -void SliceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - if (top.size() == 1) { return; } - int offset_slice_axis = 0; - const Dtype* bottom_data = bottom[0]->gpu_data(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = true; - for (int i = 0; i < top.size(); ++i) { - Dtype* top_data = top[i]->mutable_gpu_data(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, bottom_data, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data); - offset_slice_axis += top_slice_axis; - } -} - -template -void SliceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0] || top.size() == 1) { return; } - int offset_slice_axis = 0; - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = false; - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff); - offset_slice_axis += top_slice_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SliceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index f60e9b0..edb46cd 100644 --- 
a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -85,11 +85,6 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } - -#ifdef CPU_ONLY -STUB_GPU(SoftmaxLayer); -#endif - INSTANTIATE_CLASS(SoftmaxLayer); } // namespace caffe diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/softmax_layer.cu deleted file mode 100644 index 7a9e683..0000000 --- a/src/caffe/layers/softmax_layer.cu +++ /dev/null @@ -1,148 +0,0 @@ -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layers/softmax_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype maxval = -FLT_MAX; - for (int c = 0; c < channels; ++c) { - maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); - } - out[index] = maxval; - } -} - -template -__global__ void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] -= channel_max[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_exp(const int count, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, count) { - out[index] = exp(data[index]); - } -} - -template -__global__ void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype sum = 0; - for (int c = 0; c < channels; ++c) { - sum += data[(n * channels + c) * spatial_dim + s]; - } - channel_sum[index] = sum; - } -} - -template -__global__ void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype dot = 0; - for (int c = 0; c < channels; ++c) { - dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); - } - channel_dot[index] = dot; - } -} - -template -void SoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = bottom[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, bottom_data, top_data); - // We need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. 
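The shift by the per-channel maximum is the standard softmax stabilization: the result is unchanged because exp(x_i - m) / sum_j exp(x_j - m) = exp(x_i) / sum_j exp(x_j) for any constant m, and choosing m as the maximum keeps every exponent at or below zero so exp() cannot overflow. A minimal standalone CPU version of the same max / subtract / exp / sum / divide sequence over a single channel vector (the example logits are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically stable softmax over one vector of logits, following the same
// steps as the deleted kernels: max, subtract, exp, sum, divide.
std::vector<double> StableSoftmax(const std::vector<double>& logits) {
  const double m = *std::max_element(logits.begin(), logits.end());
  std::vector<double> out(logits.size());
  double sum = 0.0;
  for (size_t i = 0; i < logits.size(); ++i) {
    out[i] = std::exp(logits[i] - m);  // every exponent is <= 0
    sum += out[i];
  }
  for (double& v : out) v /= sum;
  return out;
}

int main() {
  // Without the shift, exp(1000) would overflow to inf; with it the result is finite.
  for (double p : StableSoftmax({1000.0, 1001.0, 1002.0}))
    std::printf("%.6f ", p);  // 0.090031 0.244728 0.665241
  std::printf("\n");
  return 0;
}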
- // compute max - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_max<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // subtract - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); - // exponentiate - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_exp<<>>( - count, top_data, top_data); - // sum after exp - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_sum<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // divide - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_div<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); -} - -template -void SoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = top[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, top_diff, bottom_diff); - // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_dot<<>>(outer_num_, channels, inner_num_, - top_diff, top_data, scale_data); - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, bottom_diff); - // elementwise multiplication - caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index dddb760..1cbd184 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -148,10 +148,6 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(SoftmaxWithLossLayer); -#endif - INSTANTIATE_CLASS(SoftmaxWithLossLayer); REGISTER_LAYER_CLASS(SoftmaxWithLoss); diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu deleted file mode 100644 index 660e1b3..0000000 --- a/src/caffe/layers/softmax_loss_layer.cu +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include -#include - -#include "caffe/layers/softmax_loss_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - loss[index] = 0; - counts[index] = 0; - } else { - loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], - Dtype(FLT_MIN))); - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is not 
used for anything until it is overwritten - // on the backward pass, we use it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. - Dtype* loss_data = bottom[0]->mutable_gpu_diff(); - // Similarly, this memory is never used elsewhere, and thus we can use it - // to avoid having to allocate additional GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossForwardGPU<<>>(nthreads, prob_data, label, loss_data, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - Dtype loss; - caffe_gpu_asum(nthreads, loss_data, &loss); - Dtype valid_count = -1; - // Only launch another CUDA kernel if we actually need the count of valid - // outputs. - if (normalization_ == LossParameter_NormalizationMode_VALID && - has_ignore_label_) { - caffe_gpu_asum(nthreads, counts, &valid_count); - } - top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, - valid_count); - if (top.size() == 2) { - top[1]->ShareData(prob_); - } -} - -template -__global__ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts) { - const int channels = dim / spatial_dim; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < channels; ++c) { - bottom_diff[n * dim + c * spatial_dim + s] = 0; - } - counts[index] = 0; - } else { - bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is never used for anything else, - // we use to to avoid allocating new GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossBackwardGPU<<>>(nthreads, top_data, label, bottom_diff, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - - Dtype valid_count = -1; - // Only launch another CUDA kernel if we actually need the count of valid - // outputs. 
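The SoftmaxLossBackwardGPU kernel above uses the closed form of the cross-entropy-over-softmax gradient with respect to the logits: copy the predicted probabilities, subtract 1 at the labelled class, zero the whole position if it carries the ignore label, and later rescale by loss_weight divided by the normalizer. A hedged standalone sketch of that per-position rule (the class count, label, and weights are illustrative):

#include <cstdio>
#include <vector>

// Gradient of -log(prob[label]) w.r.t. the logits for one spatial position,
// mirroring SoftmaxLossBackwardGPU: start from the probabilities, subtract 1
// at the labelled class, and zero everything if the label is ignored.
std::vector<float> SoftmaxLossGrad(const std::vector<float>& prob, int label,
                                   bool has_ignore_label, int ignore_label,
                                   float loss_weight, float normalizer) {
  std::vector<float> diff(prob.size(), 0.f);
  if (has_ignore_label && label == ignore_label) return diff;  // count = 0
  for (size_t c = 0; c < prob.size(); ++c) diff[c] = prob[c];
  diff[label] -= 1.f;
  for (float& v : diff) v *= loss_weight / normalizer;
  return diff;
}

int main() {
  const std::vector<float> prob = {0.7f, 0.2f, 0.1f};
  for (float v : SoftmaxLossGrad(prob, /*label=*/1, false, -1,
                                 /*loss_weight=*/1.f, /*normalizer=*/1.f))
    std::printf("% .2f ", v);  // 0.70 -0.80 0.10
  std::printf("\n");
  return 0;
}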
- if (normalization_ == LossParameter_NormalizationMode_VALID && - has_ignore_label_) { - caffe_gpu_asum(nthreads, counts, &valid_count); - } - const Dtype loss_weight = top[0]->cpu_diff()[0] / - get_normalizer(normalization_, valid_count); - caffe_gpu_scal(prob_.count(), loss_weight , bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxWithLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 1a27a9a..19ee882 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -48,11 +48,6 @@ void SplitLayer::Backward_cpu(const vector*>& top, } } - -#ifdef CPU_ONLY -STUB_GPU(SplitLayer); -#endif - INSTANTIATE_CLASS(SplitLayer); REGISTER_LAYER_CLASS(Split); diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/split_layer.cu deleted file mode 100644 index bec9987..0000000 --- a/src/caffe/layers/split_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include - -#include "caffe/layers/split_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void SplitLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - top[i]->ShareData(*bottom[0]); - } -} - -template -void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - if (top.size() == 1) { - caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); - return; - } - caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), - bottom[0]->mutable_gpu_diff()); - // Add remaining top blob diffs. - for (int i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(SplitLayer); - -} // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index 184e926..a90291b 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -35,10 +35,6 @@ void TanHLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(TanHLayer); -#endif - INSTANTIATE_CLASS(TanHLayer); } // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cu b/src/caffe/layers/tanh_layer.cu deleted file mode 100644 index cbfc178..0000000 --- a/src/caffe/layers/tanh_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -// TanH neuron activation function layer. 
-// Adapted from ReLU layer code written by Yangqing Jia
-
-#include <vector>
-
-#include "caffe/layers/tanh_layer.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void TanHForward(const int n, const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = tanh(in[index]);
-  }
-}
-
-template <typename Dtype>
-void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  TanHForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, bottom_data, top_data);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-template <typename Dtype>
-__global__ void TanHBackward(const int n, const Dtype* in_diff,
-    const Dtype* out_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    Dtype tanhx = out_data[index];
-    out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);
-  }
-}
-
-template <typename Dtype>
-void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    TanHBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, top_data, bottom_diff);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(TanHLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp
index 63822ee..f1ff1b2 100644
--- a/src/caffe/layers/threshold_layer.cpp
+++ b/src/caffe/layers/threshold_layer.cpp
@@ -22,10 +22,6 @@ void ThresholdLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   }
 }
-#ifdef CPU_ONLY
-STUB_GPU_FORWARD(ThresholdLayer, Forward);
-#endif
-
 INSTANTIATE_CLASS(ThresholdLayer);
 REGISTER_LAYER_CLASS(Threshold);
diff --git a/src/caffe/layers/threshold_layer.cu b/src/caffe/layers/threshold_layer.cu
deleted file mode 100644
index b0b0665..0000000
--- a/src/caffe/layers/threshold_layer.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <vector>
-
-#include "caffe/layers/threshold_layer.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void ThresholdForward(const int n, const Dtype threshold,
-    const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = in[index] > threshold ?
1 : 0; - } -} - -template -void ThresholdLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - ThresholdForward<<>>( - count, threshold_, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FORWARD(ThresholdLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/tile_layer.cpp b/src/caffe/layers/tile_layer.cpp index cf0c187..269e97a 100644 --- a/src/caffe/layers/tile_layer.cpp +++ b/src/caffe/layers/tile_layer.cpp @@ -51,10 +51,6 @@ void TileLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY -STUB_GPU(TileLayer); -#endif - INSTANTIATE_CLASS(TileLayer); REGISTER_LAYER_CLASS(Tile); diff --git a/src/caffe/layers/tile_layer.cu b/src/caffe/layers/tile_layer.cu deleted file mode 100644 index 282049e..0000000 --- a/src/caffe/layers/tile_layer.cu +++ /dev/null @@ -1,66 +0,0 @@ -#include - -#include "caffe/layers/tile_layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -__global__ void Tile(const int nthreads, const Dtype* bottom_data, - const int tile_size, const int num_tiles, const int bottom_tile_axis, - Dtype* top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int d = index % tile_size; - const int b = (index / tile_size / num_tiles) % bottom_tile_axis; - const int n = index / tile_size / num_tiles / bottom_tile_axis; - const int bottom_index = (n * bottom_tile_axis + b) * tile_size + d; - top_data[index] = bottom_data[bottom_index]; - } -} - -template -void TileLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int bottom_tile_axis = bottom[0]->shape(axis_); - const int nthreads = top[0]->count(); - Tile // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, bottom_data, inner_dim_, tiles_, bottom_tile_axis, top_data); -} - -template -__global__ void TileBackward(const int nthreads, const Dtype* top_diff, - const int tile_size, const int num_tiles, const int bottom_tile_axis, - Dtype* bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int d = index % tile_size; - const int b = (index / tile_size) % bottom_tile_axis; - const int n = index / tile_size / bottom_tile_axis; - bottom_diff[index] = 0; - int top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d; - for (int t = 0; t < num_tiles; ++t) { - bottom_diff[index] += top_diff[top_index]; - top_index += bottom_tile_axis * tile_size; - } - } -} - -template -void TileLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int bottom_tile_axis = bottom[0]->shape(axis_); - const int tile_size = inner_dim_ / bottom_tile_axis; - const int nthreads = bottom[0]->count(); - TileBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, tile_size, tiles_, bottom_tile_axis, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(TileLayer); - -} // namespace caffe diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp deleted file mode 100644 index 103dd4b..0000000 --- a/src/caffe/layers/window_data_layer.cpp +++ /dev/null @@ -1,476 +0,0 @@ -#ifdef USE_OPENCV -#include -#include - 
-#include -#include -#include -#include -#include - -#include "opencv2/core/core.hpp" -#include "opencv2/highgui/highgui.hpp" -#include "opencv2/imgproc/imgproc.hpp" - -#include "caffe/data_transformer.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/layers/base_data_layer.hpp" -#include "caffe/layers/window_data_layer.hpp" -#include "caffe/util/benchmark.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/util/rng.hpp" - -// caffe.proto > LayerParameter > WindowDataParameter -// 'source' field specifies the window_file -// 'crop_size' indicates the desired warped size - -namespace caffe { - -template -WindowDataLayer::~WindowDataLayer() { - this->StopInternalThread(); -} - -template -void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { - // LayerSetUp runs through the window_file and creates two structures - // that hold windows: one for foreground (object) windows and one - // for background (non-object) windows. We use an overlap threshold - // to decide which is which. - - // window_file format - // repeated: - // # image_index - // img_path (abs path) - // channels - // height - // width - // num_windows - // class_index overlap x1 y1 x2 y2 - - LOG(INFO) << "Window data layer:" << std::endl - << " foreground (object) overlap threshold: " - << this->layer_param_.window_data_param().fg_threshold() << std::endl - << " background (non-object) overlap threshold: " - << this->layer_param_.window_data_param().bg_threshold() << std::endl - << " foreground sampling fraction: " - << this->layer_param_.window_data_param().fg_fraction() << std::endl - << " cache_images: " - << this->layer_param_.window_data_param().cache_images() << std::endl - << " root_folder: " - << this->layer_param_.window_data_param().root_folder(); - - cache_images_ = this->layer_param_.window_data_param().cache_images(); - string root_folder = this->layer_param_.window_data_param().root_folder(); - - const bool prefetch_needs_rand = - this->transform_param_.mirror() || - this->transform_param_.crop_size(); - if (prefetch_needs_rand) { - const unsigned int prefetch_rng_seed = caffe_rng_rand(); - prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); - } else { - prefetch_rng_.reset(); - } - - std::ifstream infile(this->layer_param_.window_data_param().source().c_str()); - CHECK(infile.good()) << "Failed to open window file " - << this->layer_param_.window_data_param().source() << std::endl; - - map label_hist; - label_hist.insert(std::make_pair(0, 0)); - - string hashtag; - int image_index, channels; - if (!(infile >> hashtag >> image_index)) { - LOG(FATAL) << "Window file is empty"; - } - do { - CHECK_EQ(hashtag, "#"); - // read image path - string image_path; - infile >> image_path; - image_path = root_folder + image_path; - // read image dimensions - vector image_size(3); - infile >> image_size[0] >> image_size[1] >> image_size[2]; - channels = image_size[0]; - image_database_.push_back(std::make_pair(image_path, image_size)); - - if (cache_images_) { - Datum datum; - if (!ReadFileToDatum(image_path, &datum)) { - LOG(ERROR) << "Could not open or find file " << image_path; - return; - } - image_database_cache_.push_back(std::make_pair(image_path, datum)); - } - // read each box - int num_windows; - infile >> num_windows; - const float fg_threshold = - this->layer_param_.window_data_param().fg_threshold(); - const float bg_threshold = - this->layer_param_.window_data_param().bg_threshold(); - for (int i = 0; i < num_windows; ++i) { - 
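For concreteness, one record of the window_file format parsed above might look like this (values invented for illustration; per record: "# image_index", absolute image path, channels, height, width, num_windows, then one "class_index overlap x1 y1 x2 y2" line per window):

# 0
/path/to/images/000001.jpg
3
375
500
2
1 0.81 15 20 210 310
0 0.05 300 40 480 220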
int label, x1, y1, x2, y2; - float overlap; - infile >> label >> overlap >> x1 >> y1 >> x2 >> y2; - - vector window(WindowDataLayer::NUM); - window[WindowDataLayer::IMAGE_INDEX] = image_index; - window[WindowDataLayer::LABEL] = label; - window[WindowDataLayer::OVERLAP] = overlap; - window[WindowDataLayer::X1] = x1; - window[WindowDataLayer::Y1] = y1; - window[WindowDataLayer::X2] = x2; - window[WindowDataLayer::Y2] = y2; - - // add window to foreground list or background list - if (overlap >= fg_threshold) { - int label = window[WindowDataLayer::LABEL]; - CHECK_GT(label, 0); - fg_windows_.push_back(window); - label_hist.insert(std::make_pair(label, 0)); - label_hist[label]++; - } else if (overlap < bg_threshold) { - // background window, force label and overlap to 0 - window[WindowDataLayer::LABEL] = 0; - window[WindowDataLayer::OVERLAP] = 0; - bg_windows_.push_back(window); - label_hist[0]++; - } - } - - if (image_index % 100 == 0) { - LOG(INFO) << "num: " << image_index << " " - << image_path << " " - << image_size[0] << " " - << image_size[1] << " " - << image_size[2] << " " - << "windows to process: " << num_windows; - } - } while (infile >> hashtag >> image_index); - - LOG(INFO) << "Number of images: " << image_index+1; - - for (map::iterator it = label_hist.begin(); - it != label_hist.end(); ++it) { - LOG(INFO) << "class " << it->first << " has " << label_hist[it->first] - << " samples"; - } - - LOG(INFO) << "Amount of context padding: " - << this->layer_param_.window_data_param().context_pad(); - - LOG(INFO) << "Crop mode: " - << this->layer_param_.window_data_param().crop_mode(); - - // image - const int crop_size = this->transform_param_.crop_size(); - CHECK_GT(crop_size, 0); - const int batch_size = this->layer_param_.window_data_param().batch_size(); - top[0]->Reshape(batch_size, channels, crop_size, crop_size); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) - this->prefetch_[i].data_.Reshape( - batch_size, channels, crop_size, crop_size); - - LOG(INFO) << "output data size: " << top[0]->num() << "," - << top[0]->channels() << "," << top[0]->height() << "," - << top[0]->width(); - // label - vector label_shape(1, batch_size); - top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); - } - - // data mean - has_mean_file_ = this->transform_param_.has_mean_file(); - has_mean_values_ = this->transform_param_.mean_value_size() > 0; - if (has_mean_file_) { - const string& mean_file = - this->transform_param_.mean_file(); - LOG(INFO) << "Loading mean file from: " << mean_file; - BlobProto blob_proto; - ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); - data_mean_.FromProto(blob_proto); - } - if (has_mean_values_) { - CHECK(has_mean_file_ == false) << - "Cannot specify mean_file and mean_value at the same time"; - for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) { - mean_values_.push_back(this->transform_param_.mean_value(c)); - } - CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) << - "Specify either 1 mean_value or as many as channels: " << channels; - if (channels > 1 && mean_values_.size() == 1) { - // Replicate the mean_value for simplicity - for (int c = 1; c < channels; ++c) { - mean_values_.push_back(mean_values_[0]); - } - } - } -} - -template -unsigned int WindowDataLayer::PrefetchRand() { - CHECK(prefetch_rng_); - caffe::rng_t* prefetch_rng = - static_cast(prefetch_rng_->generator()); - return (*prefetch_rng)(); -} - -// This function is called on 
prefetch thread -template -void WindowDataLayer::load_batch(Batch* batch) { - // At each iteration, sample N windows where N*p are foreground (object) - // windows and N*(1-p) are background (non-object) windows - CPUTimer batch_timer; - batch_timer.Start(); - double read_time = 0; - double trans_time = 0; - CPUTimer timer; - Dtype* top_data = batch->data_.mutable_cpu_data(); - Dtype* top_label = batch->label_.mutable_cpu_data(); - const Dtype scale = this->layer_param_.window_data_param().scale(); - const int batch_size = this->layer_param_.window_data_param().batch_size(); - const int context_pad = this->layer_param_.window_data_param().context_pad(); - const int crop_size = this->transform_param_.crop_size(); - const bool mirror = this->transform_param_.mirror(); - const float fg_fraction = - this->layer_param_.window_data_param().fg_fraction(); - Dtype* mean = NULL; - int mean_off = 0; - int mean_width = 0; - int mean_height = 0; - if (this->has_mean_file_) { - mean = this->data_mean_.mutable_cpu_data(); - mean_off = (this->data_mean_.width() - crop_size) / 2; - mean_width = this->data_mean_.width(); - mean_height = this->data_mean_.height(); - } - cv::Size cv_crop_size(crop_size, crop_size); - const string& crop_mode = this->layer_param_.window_data_param().crop_mode(); - - bool use_square = (crop_mode == "square") ? true : false; - - // zero out batch - caffe_set(batch->data_.count(), Dtype(0), top_data); - - const int num_fg = static_cast(static_cast(batch_size) - * fg_fraction); - const int num_samples[2] = { batch_size - num_fg, num_fg }; - - int item_id = 0; - CHECK_GT(fg_windows_.size(), 0); - CHECK_GT(bg_windows_.size(), 0); - - // sample from bg set then fg set - for (int is_fg = 0; is_fg < 2; ++is_fg) { - for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) { - // sample a window - timer.Start(); - const unsigned int rand_index = PrefetchRand(); - vector window = (is_fg) ? 
- fg_windows_[rand_index % fg_windows_.size()] : - bg_windows_[rand_index % bg_windows_.size()]; - - bool do_mirror = mirror && PrefetchRand() % 2; - - // load the image containing the window - pair > image = - image_database_[window[WindowDataLayer::IMAGE_INDEX]]; - - cv::Mat cv_img; - if (this->cache_images_) { - pair image_cached = - image_database_cache_[window[WindowDataLayer::IMAGE_INDEX]]; - cv_img = DecodeDatumToCVMat(image_cached.second, true); - } else { - cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR); - if (!cv_img.data) { - LOG(ERROR) << "Could not open or find file " << image.first; - return; - } - } - read_time += timer.MicroSeconds(); - timer.Start(); - const int channels = cv_img.channels(); - - // crop window out of image and warp it - int x1 = window[WindowDataLayer::X1]; - int y1 = window[WindowDataLayer::Y1]; - int x2 = window[WindowDataLayer::X2]; - int y2 = window[WindowDataLayer::Y2]; - - int pad_w = 0; - int pad_h = 0; - if (context_pad > 0 || use_square) { - // scale factor by which to expand the original region - // such that after warping the expanded region to crop_size x crop_size - // there's exactly context_pad amount of padding on each side - Dtype context_scale = static_cast(crop_size) / - static_cast(crop_size - 2*context_pad); - - // compute the expanded region - Dtype half_height = static_cast(y2-y1+1)/2.0; - Dtype half_width = static_cast(x2-x1+1)/2.0; - Dtype center_x = static_cast(x1) + half_width; - Dtype center_y = static_cast(y1) + half_height; - if (use_square) { - if (half_height > half_width) { - half_width = half_height; - } else { - half_height = half_width; - } - } - x1 = static_cast(round(center_x - half_width*context_scale)); - x2 = static_cast(round(center_x + half_width*context_scale)); - y1 = static_cast(round(center_y - half_height*context_scale)); - y2 = static_cast(round(center_y + half_height*context_scale)); - - // the expanded region may go outside of the image - // so we compute the clipped (expanded) region and keep track of - // the extent beyond the image - int unclipped_height = y2-y1+1; - int unclipped_width = x2-x1+1; - int pad_x1 = std::max(0, -x1); - int pad_y1 = std::max(0, -y1); - int pad_x2 = std::max(0, x2 - cv_img.cols + 1); - int pad_y2 = std::max(0, y2 - cv_img.rows + 1); - // clip bounds - x1 = x1 + pad_x1; - x2 = x2 - pad_x2; - y1 = y1 + pad_y1; - y2 = y2 - pad_y2; - CHECK_GT(x1, -1); - CHECK_GT(y1, -1); - CHECK_LT(x2, cv_img.cols); - CHECK_LT(y2, cv_img.rows); - - int clipped_height = y2-y1+1; - int clipped_width = x2-x1+1; - - // scale factors that would be used to warp the unclipped - // expanded region - Dtype scale_x = - static_cast(crop_size)/static_cast(unclipped_width); - Dtype scale_y = - static_cast(crop_size)/static_cast(unclipped_height); - - // size to warp the clipped expanded region to - cv_crop_size.width = - static_cast(round(static_cast(clipped_width)*scale_x)); - cv_crop_size.height = - static_cast(round(static_cast(clipped_height)*scale_y)); - pad_x1 = static_cast(round(static_cast(pad_x1)*scale_x)); - pad_x2 = static_cast(round(static_cast(pad_x2)*scale_x)); - pad_y1 = static_cast(round(static_cast(pad_y1)*scale_y)); - pad_y2 = static_cast(round(static_cast(pad_y2)*scale_y)); - - pad_h = pad_y1; - // if we're mirroring, we mirror the padding too (to be pedantic) - if (do_mirror) { - pad_w = pad_x2; - } else { - pad_w = pad_x1; - } - - // ensure that the warped, clipped region plus the padding fits in the - // crop_size x crop_size image (it might not due to rounding) - if (pad_h + 
cv_crop_size.height > crop_size) { - cv_crop_size.height = crop_size - pad_h; - } - if (pad_w + cv_crop_size.width > crop_size) { - cv_crop_size.width = crop_size - pad_w; - } - } - - cv::Rect roi(x1, y1, x2-x1+1, y2-y1+1); - cv::Mat cv_cropped_img = cv_img(roi); - cv::resize(cv_cropped_img, cv_cropped_img, - cv_crop_size, 0, 0, cv::INTER_LINEAR); - - // horizontal flip at random - if (do_mirror) { - cv::flip(cv_cropped_img, cv_cropped_img, 1); - } - - // copy the warped window into top_data - for (int h = 0; h < cv_cropped_img.rows; ++h) { - const uchar* ptr = cv_cropped_img.ptr(h); - int img_index = 0; - for (int w = 0; w < cv_cropped_img.cols; ++w) { - for (int c = 0; c < channels; ++c) { - int top_index = ((item_id * channels + c) * crop_size + h + pad_h) - * crop_size + w + pad_w; - // int top_index = (c * height + h) * width + w; - Dtype pixel = static_cast(ptr[img_index++]); - if (this->has_mean_file_) { - int mean_index = (c * mean_height + h + mean_off + pad_h) - * mean_width + w + mean_off + pad_w; - top_data[top_index] = (pixel - mean[mean_index]) * scale; - } else { - if (this->has_mean_values_) { - top_data[top_index] = (pixel - this->mean_values_[c]) * scale; - } else { - top_data[top_index] = pixel * scale; - } - } - } - } - } - trans_time += timer.MicroSeconds(); - // get window label - top_label[item_id] = window[WindowDataLayer::LABEL]; - - #if 0 - // useful debugging code for dumping transformed windows to disk - string file_id; - std::stringstream ss; - ss << PrefetchRand(); - ss >> file_id; - std::ofstream inf((string("dump/") + file_id + - string("_info.txt")).c_str(), std::ofstream::out); - inf << image.first << std::endl - << window[WindowDataLayer::X1]+1 << std::endl - << window[WindowDataLayer::Y1]+1 << std::endl - << window[WindowDataLayer::X2]+1 << std::endl - << window[WindowDataLayer::Y2]+1 << std::endl - << do_mirror << std::endl - << top_label[item_id] << std::endl - << is_fg << std::endl; - inf.close(); - std::ofstream top_data_file((string("dump/") + file_id + - string("_data.txt")).c_str(), - std::ofstream::out | std::ofstream::binary); - for (int c = 0; c < channels; ++c) { - for (int h = 0; h < crop_size; ++h) { - for (int w = 0; w < crop_size; ++w) { - top_data_file.write(reinterpret_cast( - &top_data[((item_id * channels + c) * crop_size + h) - * crop_size + w]), - sizeof(Dtype)); - } - } - } - top_data_file.close(); - #endif - - item_id++; - } - } - batch_timer.Stop(); - DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; - DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; - DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; -} - -INSTANTIATE_CLASS(WindowDataLayer); -REGISTER_LAYER_CLASS(WindowData); - -} // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 59f12c6..e606b46 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -8,7 +8,6 @@ #include "caffe/common.hpp" #include "caffe/layer.hpp" #include "caffe/net.hpp" -#include "caffe/parallel.hpp" #include "caffe/proto/caffe.pb.h" #include "caffe/util/insert_splits.hpp" #include "caffe/util/math_functions.hpp" @@ -33,17 +32,12 @@ Net::Net(const string& param_file, Phase phase, const Net* root_net) template void Net::Init(const NetParameter& in_param) { - CHECK(Caffe::root_solver() || root_net_) - << "root_net_ needs to be set for all non-root solvers"; // Set phase from the state. phase_ = in_param.state().phase(); // Filter layers based on their include/exclude rules and // the current NetState. 
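Illustrative sketch (not from this file) of how such an include/exclude rule is checked against the net state, with simplified stand-in structs in place of the generated NetState/NetStateRule messages:

enum Phase { TRAIN, TEST };

struct StateLite { Phase phase; int level; };
struct RuleLite  { bool has_phase; Phase phase; bool has_min_level; int min_level; };

// A layer carrying an include rule is kept only when every set field matches.
bool rule_met(const RuleLite& r, const StateLite& s) {
  if (r.has_phase && r.phase != s.phase) return false;
  if (r.has_min_level && s.level < r.min_level) return false;
  return true;
}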
NetParameter filtered_param; FilterNet(in_param, &filtered_param); - LOG_IF(INFO, Caffe::root_solver()) - << "Initializing net from parameters: " << std::endl - << filtered_param.DebugString(); // Create a copy of filtered_param with splits added where necessary. NetParameter param; InsertSplits(filtered_param, ¶m); @@ -61,8 +55,7 @@ void Net::Init(const NetParameter& in_param) { bottom_need_backward_.resize(param.layer_size()); for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { // For non-root solvers, whether this layer is shared from root_net_. - bool share_from_root = !Caffe::root_solver() - && root_net_->layers_[layer_id]->ShareInParallel(); + bool share_from_root = !true; // Inherit phase from net if unset. if (!param.layer(layer_id).has_phase()) { param.mutable_layer(layer_id)->set_phase(phase_); @@ -83,8 +76,6 @@ void Net::Init(const NetParameter& in_param) { layers_.push_back(LayerRegistry::CreateLayer(layer_param)); } layer_names_.push_back(layer_param.name()); - LOG_IF(INFO, Caffe::root_solver()) - << "Creating Layer " << layer_param.name(); bool need_backward = false; // Figure out this layer's input and output @@ -126,30 +117,17 @@ void Net::Init(const NetParameter& in_param) { const vector*>& this_top = this->top_vecs_[layer_id]; for (int top_id = 0; top_id < base_top.size(); ++top_id) { this_top[top_id]->ReshapeLike(*base_top[top_id]); - LOG(INFO) << "Created top blob " << top_id << " (shape: " - << this_top[top_id]->shape_string() << ") for shared layer " - << layer_param.name(); } } else { layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); } - LOG_IF(INFO, Caffe::root_solver()) - << "Setting up " << layer_names_[layer_id]; for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) { blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0)); } blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id); - LOG_IF(INFO, Caffe::root_solver()) - << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string(); - if (layer->loss(top_id)) { - LOG_IF(INFO, Caffe::root_solver()) - << " with loss weight " << layer->loss(top_id); - } memory_used_ += top_vecs_[layer_id][top_id]->count(); } - LOG_IF(INFO, Caffe::root_solver()) - << "Memory required for data: " << memory_used_ * sizeof(Dtype); const int param_size = layer_param.param_size(); const int num_param_blobs = layers_[layer_id]->blobs().size(); CHECK_LE(param_size, num_param_blobs) @@ -207,14 +185,6 @@ void Net::Init(const NetParameter& in_param) { } } if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; } - if (Caffe::root_solver()) { - if (layer_need_backward_[layer_id]) { - LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; - } else { - LOG(INFO) << layer_names_[layer_id] - << " does not need backward computation."; - } - } for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); ++bottom_id) { if (layer_contributes_loss) { @@ -253,8 +223,6 @@ void Net::Init(const NetParameter& in_param) { // In the end, all remaining blobs are considered output blobs. 
for (set::iterator it = available_blobs.begin(); it != available_blobs.end(); ++it) { - LOG_IF(INFO, Caffe::root_solver()) - << "This network produces output " << *it; net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get()); net_output_blob_indices_.push_back(blob_name_to_idx[*it]); } @@ -266,7 +234,6 @@ void Net::Init(const NetParameter& in_param) { } ShareWeights(); debug_info_ = param.debug_info(); - LOG_IF(INFO, Caffe::root_solver()) << "Network initialization done."; } template @@ -305,30 +272,18 @@ bool Net::StateMeetsRule(const NetState& state, // Check whether the rule is broken due to phase. if (rule.has_phase()) { if (rule.phase() != state.phase()) { - LOG_IF(INFO, Caffe::root_solver()) - << "The NetState phase (" << state.phase() - << ") differed from the phase (" << rule.phase() - << ") specified by a rule in layer " << layer_name; return false; } } // Check whether the rule is broken due to min level. if (rule.has_min_level()) { if (state.level() < rule.min_level()) { - LOG_IF(INFO, Caffe::root_solver()) - << "The NetState level (" << state.level() - << ") is above the min_level (" << rule.min_level() - << ") specified by a rule in layer " << layer_name; return false; } } // Check whether the rule is broken due to max level. if (rule.has_max_level()) { if (state.level() > rule.max_level()) { - LOG_IF(INFO, Caffe::root_solver()) - << "The NetState level (" << state.level() - << ") is above the max_level (" << rule.max_level() - << ") specified by a rule in layer " << layer_name; return false; } } @@ -341,9 +296,6 @@ bool Net::StateMeetsRule(const NetState& state, if (rule.stage(i) == state.stage(j)) { has_stage = true; } } if (!has_stage) { - LOG_IF(INFO, Caffe::root_solver()) - << "The NetState did not contain stage '" << rule.stage(i) - << "' specified by a rule in layer " << layer_name; return false; } } @@ -356,9 +308,6 @@ bool Net::StateMeetsRule(const NetState& state, if (rule.not_stage(i) == state.stage(j)) { has_stage = true; } } if (has_stage) { - LOG_IF(INFO, Caffe::root_solver()) - << "The NetState contained a not_stage '" << rule.not_stage(i) - << "' specified by a rule in layer " << layer_name; return false; } } @@ -378,8 +327,6 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, if (blob_name_to_idx && layer_param->bottom_size() > top_id && blob_name == layer_param->bottom(top_id)) { // In-place computation - LOG_IF(INFO, Caffe::root_solver()) - << layer_param->name() << " -> " << blob_name << " (in-place)"; top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); } else if (blob_name_to_idx && @@ -390,9 +337,6 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, << "' produced by multiple sources."; } else { // Normal output. 
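Aside (illustrative): the "in-place computation" branch above fires when a layer lists the same blob name as both its bottom and its top at the same index, e.g. an activation that overwrites its input blob; a minimal sketch of that test:

#include <string>

// Same name at the same index means the existing blob is reused; no new
// blob is allocated and no extra memory is counted for the top.
bool is_in_place(const std::string& bottom_name, const std::string& top_name) {
  return bottom_name == top_name;
}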
- if (Caffe::root_solver()) { - LOG(INFO) << layer_param->name() << " -> " << blob_name; - } shared_ptr > blob_pointer(new Blob()); const int blob_id = blobs_.size(); blobs_.push_back(blob_pointer); @@ -417,8 +361,6 @@ int Net::AppendBottom(const NetParameter& param, const int layer_id, << layer_param.name() << "', bottom index " << bottom_id << ")"; } const int blob_id = (*blob_name_to_idx)[blob_name]; - LOG_IF(INFO, Caffe::root_solver()) - << layer_names_[layer_id] << " <- " << blob_name; bottom_vecs_[layer_id].push_back(blobs_[blob_id].get()); bottom_id_vecs_[layer_id].push_back(blob_id); available_blobs->erase(blob_name); @@ -476,10 +418,6 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, param_layer_indices_[owner_net_param_id]; const int owner_layer_id = owner_index.first; const int owner_param_id = owner_index.second; - LOG_IF(INFO, Caffe::root_solver()) << "Sharing parameters '" << param_name - << "' owned by " - << "layer '" << layer_names_[owner_layer_id] << "', param " - << "index " << owner_param_id; Blob* this_blob = layers_[layer_id]->blobs()[param_id].get(); Blob* owner_blob = layers_[owner_layer_id]->blobs()[owner_param_id].get(); @@ -535,7 +473,6 @@ Dtype Net::ForwardFromTo(int start, int end) { // LOG(ERROR) << "Forwarding " << layer_names_[i]; Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; - if (debug_info_) { ForwardDebugInfo(i); } } return loss; } @@ -580,90 +517,10 @@ void Net::BackwardFromTo(int start, int end) { if (layer_need_backward_[i]) { layers_[i]->Backward( top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); - if (debug_info_) { BackwardDebugInfo(i); } } } } -template -void Net::ForwardDebugInfo(const int layer_id) { - for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { - const Blob& blob = *top_vecs_[layer_id][top_id]; - const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG_IF(INFO, Caffe::root_solver()) - << " [Forward] " - << "Layer " << layer_names_[layer_id] - << ", top blob " << blob_name - << " data: " << data_abs_val_mean; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - const Blob& blob = *layers_[layer_id]->blobs()[param_id]; - const int net_param_id = param_id_vecs_[layer_id][param_id]; - const string& blob_name = param_display_names_[net_param_id]; - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG_IF(INFO, Caffe::root_solver()) - << " [Forward] " - << "Layer " << layer_names_[layer_id] - << ", param blob " << blob_name - << " data: " << data_abs_val_mean; - } -} - -template -void Net::BackwardDebugInfo(const int layer_id) { - const vector*>& bottom_vec = bottom_vecs_[layer_id]; - for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { - if (!bottom_need_backward_[layer_id][bottom_id]) { continue; } - const Blob& blob = *bottom_vec[bottom_id]; - const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG_IF(INFO, Caffe::root_solver()) - << " [Backward] " - << "Layer " << layer_names_[layer_id] - << ", bottom blob " << blob_name - << " diff: " << diff_abs_val_mean; - } - for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } - const Blob& blob = *layers_[layer_id]->blobs()[param_id]; - const Dtype 
diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG_IF(INFO, Caffe::root_solver()) - << " [Backward] " - << "Layer " << layer_names_[layer_id] - << ", param blob " << param_id - << " diff: " << diff_abs_val_mean; - } -} - -template -void Net::UpdateDebugInfo(const int param_id) { - const Blob& blob = *params_[param_id]; - const int param_owner = param_owners_[param_id]; - const string& layer_name = layer_names_[param_layer_indices_[param_id].first]; - const string& param_display_name = param_display_names_[param_id]; - const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - if (param_owner < 0) { - const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG_IF(INFO, Caffe::root_solver()) - << " [Update] Layer " << layer_name - << ", param " << param_display_name - << " data: " << data_abs_val_mean - << "; diff: " << diff_abs_val_mean; - } else { - const string& owner_layer_name = - layer_names_[param_layer_indices_[param_owner].first]; - LOG_IF(INFO, Caffe::root_solver()) - << " [Update] Layer " << layer_name - << ", param blob " << param_display_name - << " (owned by layer " << owner_layer_name << ", " << "param " - << param_display_names_[param_owners_[param_id]] << ")" - << " diff: " << diff_abs_val_mean; - } -} - template void Net::ShareTrainedLayersWith(const Net* other) { int num_source_layers = other->layers().size(); @@ -676,10 +533,8 @@ void Net::ShareTrainedLayersWith(const Net* other) { ++target_layer_id; } if (target_layer_id == layer_names_.size()) { - LOG(INFO) << "Ignoring source layer " << source_layer_name; continue; } - DLOG(INFO) << "Copying source layer " << source_layer_name; vector > >& target_blobs = layers_[target_layer_id]->blobs(); CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) @@ -744,10 +599,8 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { ++target_layer_id; } if (target_layer_id == layer_names_.size()) { - LOG(INFO) << "Ignoring source layer " << source_layer_name; continue; } - DLOG(INFO) << "Copying source layer " << source_layer_name; vector > >& target_blobs = layers_[target_layer_id]->blobs(); CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) @@ -772,12 +625,7 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { template void Net::CopyTrainedLayersFrom(const string trained_filename) { - if (trained_filename.size() >= 3 && - trained_filename.compare(trained_filename.size() - 3, 3, ".h5") == 0) { - CopyTrainedLayersFromHDF5(trained_filename); - } else { - CopyTrainedLayersFromBinaryProto(trained_filename); - } + CopyTrainedLayersFromBinaryProto(trained_filename); } template @@ -788,127 +636,17 @@ void Net::CopyTrainedLayersFromBinaryProto( CopyTrainedLayersFrom(param); } -template -void Net::CopyTrainedLayersFromHDF5(const string trained_filename) { - NOT_IMPLEMENTED; - //hid_t file_hid = H5Fopen(trained_filename.c_str(), H5F_ACC_RDONLY, - // H5P_DEFAULT); - //CHECK_GE(file_hid, 0) << "Couldn't open " << trained_filename; - //hid_t data_hid = H5Gopen2(file_hid, "data", H5P_DEFAULT); - //CHECK_GE(data_hid, 0) << "Error reading weights from " << trained_filename; - //int num_layers = hdf5_get_num_links(data_hid); - //for (int i = 0; i < num_layers; ++i) { - // string source_layer_name = hdf5_get_name_by_idx(data_hid, i); - // if (!layer_names_index_.count(source_layer_name)) { - // LOG(INFO) << "Ignoring source layer " << source_layer_name; - // continue; - // } - // int target_layer_id = layer_names_index_[source_layer_name]; - // DLOG(INFO) << "Copying source layer " << 
source_layer_name; - // vector > >& target_blobs = - // layers_[target_layer_id]->blobs(); - // hid_t layer_hid = H5Gopen2(data_hid, source_layer_name.c_str(), - // H5P_DEFAULT); - // CHECK_GE(layer_hid, 0) - // << "Error reading weights from " << trained_filename; - // // Check that source layer doesn't have more params than target layer - // int num_source_params = hdf5_get_num_links(layer_hid); - // CHECK_LE(num_source_params, target_blobs.size()) - // << "Incompatible number of blobs for layer " << source_layer_name; - // for (int j = 0; j < target_blobs.size(); ++j) { - // ostringstream oss; - // oss << j; - // string dataset_name = oss.str(); - // int target_net_param_id = param_id_vecs_[target_layer_id][j]; - // if (!H5Lexists(layer_hid, dataset_name.c_str(), H5P_DEFAULT)) { - // // Target param doesn't exist in source weights... - // if (param_owners_[target_net_param_id] != -1) { - // // ...but it's weight-shared in target, so that's fine. - // continue; - // } else { - // LOG(FATAL) << "Incompatible number of blobs for layer " - // << source_layer_name; - // } - // } - // hdf5_load_nd_dataset(layer_hid, dataset_name.c_str(), 0, kMaxBlobAxes, - // target_blobs[j].get()); - // } - // H5Gclose(layer_hid); - //} - //H5Gclose(data_hid); - //H5Fclose(file_hid); -} - template void Net::ToProto(NetParameter* param, bool write_diff) const { param->Clear(); param->set_name(name_); // Add bottom and top - DLOG(INFO) << "Serializing " << layers_.size() << " layers"; for (int i = 0; i < layers_.size(); ++i) { LayerParameter* layer_param = param->add_layer(); layers_[i]->ToProto(layer_param, write_diff); } } -template -void Net::ToHDF5(const string& filename, bool write_diff) const { - NOT_IMPLEMENTED; - //hid_t file_hid = H5Fcreate(filename.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, - // H5P_DEFAULT); - //CHECK_GE(file_hid, 0) - // << "Couldn't open " << filename << " to save weights."; - //hid_t data_hid = H5Gcreate2(file_hid, "data", H5P_DEFAULT, H5P_DEFAULT, - // H5P_DEFAULT); - //CHECK_GE(data_hid, 0) << "Error saving weights to " << filename << "."; - //hid_t diff_hid = -1; - //if (write_diff) { - // diff_hid = H5Gcreate2(file_hid, "diff", H5P_DEFAULT, H5P_DEFAULT, - // H5P_DEFAULT); - // CHECK_GE(diff_hid, 0) << "Error saving weights to " << filename << "."; - //} - //for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { - // const LayerParameter& layer_param = layers_[layer_id]->layer_param(); - // string layer_name = layer_param.name(); - // hid_t layer_data_hid = H5Gcreate2(data_hid, layer_name.c_str(), - // H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - // CHECK_GE(layer_data_hid, 0) - // << "Error saving weights to " << filename << "."; - // hid_t layer_diff_hid = -1; - // if (write_diff) { - // layer_diff_hid = H5Gcreate2(diff_hid, layer_name.c_str(), - // H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - // CHECK_GE(layer_diff_hid, 0) - // << "Error saving weights to " << filename << "."; - // } - // int num_params = layers_[layer_id]->blobs().size(); - // for (int param_id = 0; param_id < num_params; ++param_id) { - // ostringstream dataset_name; - // dataset_name << param_id; - // const int net_param_id = param_id_vecs_[layer_id][param_id]; - // if (param_owners_[net_param_id] == -1) { - // // Only save params that own themselves - // hdf5_save_nd_dataset(layer_data_hid, dataset_name.str(), - // *params_[net_param_id]); - // } - // if (write_diff) { - // // Write diffs regardless of weight-sharing - // hdf5_save_nd_dataset(layer_diff_hid, dataset_name.str(), - // *params_[net_param_id], 
true); - // } - // } - // H5Gclose(layer_data_hid); - // if (write_diff) { - // H5Gclose(layer_diff_hid); - // } - //} - //H5Gclose(data_hid); - //if (write_diff) { - // H5Gclose(diff_hid); - //} - //H5Fclose(file_hid); -} - template void Net::Update() { for (int i = 0; i < learnable_params_.size(); ++i) { @@ -920,20 +658,8 @@ template void Net::ClearParamDiffs() { for (int i = 0; i < learnable_params_.size(); ++i) { Blob* blob = learnable_params_[i]; - switch (Caffe::mode()) { - case Caffe::CPU: - caffe_set(blob->count(), static_cast(0), - blob->mutable_cpu_diff()); - break; - case Caffe::GPU: -#ifndef CPU_ONLY - caffe_gpu_set(blob->count(), static_cast(0), - blob->mutable_gpu_diff()); -#else - NO_GPU; -#endif - break; - } + caffe_set(blob->count(), static_cast(0), + blob->mutable_cpu_diff()); } } diff --git a/src/caffe/parallel.cpp b/src/caffe/parallel.cpp deleted file mode 100644 index 5bc41c6..0000000 --- a/src/caffe/parallel.cpp +++ /dev/null @@ -1,443 +0,0 @@ -#ifndef CPU_ONLY -#include -#endif -#include -#include - -#include -#include -#include - -#include "boost/thread.hpp" -#include "caffe/caffe.hpp" -#include "caffe/parallel.hpp" - -namespace caffe { - -enum Op { - copy, - replace_cpu, - replace_gpu, - replace_cpu_diff, - replace_gpu_diff -}; - -template -static void apply_buffers(const vector*>& blobs, - Dtype* buffer, size_t total_size, Op op) { - Dtype* ptr = buffer; - for (int i = 0; i < blobs.size(); ++i) { - int size = blobs[i]->count(); - switch (op) { - case copy: { - // Init buffer to current values of blobs - caffe_copy(size, - reinterpret_cast(blobs[i]->data()->cpu_data()), - ptr); - break; - } - case replace_cpu: - blobs[i]->data()->set_cpu_data(ptr); - break; - case replace_gpu: - blobs[i]->data()->set_gpu_data(ptr); - break; - case replace_cpu_diff: - blobs[i]->diff()->set_cpu_data(ptr); - break; - case replace_gpu_diff: - blobs[i]->diff()->set_gpu_data(ptr); - break; - } - ptr += size; - } - // total_size is at least one byte - CHECK_EQ(total_size, (ptr == buffer ? 1 : ptr - buffer)); -} - -// Buffer size necessary to store given blobs -template -static size_t total_size(const vector*>& params) { - size_t size = 0; - for (int i = 0; i < params.size(); ++i) - size += params[i]->count(); - // Size have at least one byte, otherwise cudaMalloc fails if net has no - // learnable parameters. - return (size > 0) ? 
size : 1; -} - -template -Params::Params(shared_ptr > root_solver) - : size_(total_size(root_solver->net()->learnable_params())), - data_(), - diff_() { -} - -template -GPUParams::GPUParams(shared_ptr > root_solver, int device) - : Params(root_solver) { -#ifndef CPU_ONLY - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - - // Allocate device buffers - CUDA_CHECK(cudaSetDevice(device)); - CUDA_CHECK(cudaMalloc(&data_, size_ * sizeof(Dtype))); - - // Copy blob values - const vector*>& net = - root_solver->net()->learnable_params(); - apply_buffers(net, data_, size_, copy); - - CUDA_CHECK(cudaMalloc(&diff_, size_ * sizeof(Dtype))); - caffe_gpu_set(size_, Dtype(0), diff_); - - CUDA_CHECK(cudaSetDevice(initial_device)); -#else - NO_GPU; -#endif -} - -template -GPUParams::~GPUParams() { -#ifndef CPU_ONLY - CUDA_CHECK(cudaFree(data_)); - CUDA_CHECK(cudaFree(diff_)); -#endif -} - -template -void GPUParams::configure(Solver* solver) const { - const vector*>& net = - solver->net()->learnable_params(); - apply_buffers(net, data_, size_, replace_gpu); - apply_buffers(net, diff_, size_, replace_gpu_diff); -} - -void DevicePair::compute(const vector devices, vector* pairs) { -#ifndef CPU_ONLY - vector remaining(devices); - - // Depth for reduction tree - int remaining_depth = static_cast(ceil(log2(remaining.size()))); - - // Group GPUs by board - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - for (int j = i + 1; j < remaining.size(); ++j) { - cudaDeviceProp a, b; - CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i])); - CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j])); - if (a.isMultiGpuBoard && b.isMultiGpuBoard) { - if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) { - pairs->push_back(DevicePair(remaining[i], remaining[j])); - DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j]; - remaining.erase(remaining.begin() + j); - break; - } - } - } - } - } - ostringstream s; - for (int i = 0; i < remaining.size(); ++i) { - s << (i ? ", " : "") << remaining[i]; - } - DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str(); - - // Group by P2P accessibility - remaining_depth = ceil(log2(remaining.size())); - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - for (int j = i + 1; j < remaining.size(); ++j) { - int access; - CUDA_CHECK( - cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j])); - if (access) { - pairs->push_back(DevicePair(remaining[i], remaining[j])); - DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j]; - remaining.erase(remaining.begin() + j); - break; - } - } - } - } - s.str(""); - for (int i = 0; i < remaining.size(); ++i) { - s << (i ? 
", " : "") << remaining[i]; - } - DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str(); - - // Group remaining - remaining_depth = ceil(log2(remaining.size())); - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - pairs->push_back(DevicePair(remaining[i], remaining[i + 1])); - DLOG(INFO) << "Remaining pair: " << remaining[i] << ":" - << remaining[i + 1]; - remaining.erase(remaining.begin() + i + 1); - } - } - - // Should only be the parent node remaining - CHECK_EQ(remaining.size(), 1); - - pairs->insert(pairs->begin(), DevicePair(-1, remaining[0])); - - CHECK(pairs->size() == devices.size()); - for (int i = 0; i < pairs->size(); ++i) { - CHECK((*pairs)[i].parent() != (*pairs)[i].device()); - for (int j = i + 1; j < pairs->size(); ++j) { - CHECK((*pairs)[i].device() != (*pairs)[j].device()); - } - } -#else - NO_GPU; -#endif -} - -// - -template -P2PSync::P2PSync(shared_ptr > root_solver, - P2PSync* parent, const SolverParameter& param) - : GPUParams(root_solver, param.device_id()), - parent_(parent), - children_(), - queue_(), - initial_iter_(root_solver->iter()), - solver_() { -#ifndef CPU_ONLY - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - const int self = param.device_id(); - CUDA_CHECK(cudaSetDevice(self)); - - if (parent == NULL) { - solver_ = root_solver; - } else { - Caffe::set_root_solver(false); - solver_.reset(new WorkerSolver(param, root_solver.get())); - Caffe::set_root_solver(true); - } - this->configure(solver_.get()); - solver_->add_callback(this); - - if (parent) { - // Enable p2p access between devices - const int peer = parent->solver_->param().device_id(); - int access; - CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer)); - if (access) { - CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0)); - } else { - LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer; - } - // Allocate receiving buffer on parent - CUDA_CHECK(cudaSetDevice(peer)); - CUDA_CHECK(cudaMalloc(&parent_grads_, size_ * sizeof(Dtype))); - CUDA_CHECK(cudaSetDevice(self)); - } - - CUDA_CHECK(cudaSetDevice(initial_device)); -#else - NO_GPU; -#endif -} - -template -P2PSync::~P2PSync() { -#ifndef CPU_ONLY - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - const int self = solver_->param().device_id(); - CUDA_CHECK(cudaSetDevice(self)); - - if (parent_) { - CUDA_CHECK(cudaFree(parent_grads_)); - const int peer = parent_->solver_->param().device_id(); - int access; - CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer)); - if (access) { - CUDA_CHECK(cudaDeviceDisablePeerAccess(peer)); - } - } - - CUDA_CHECK(cudaSetDevice(initial_device)); -#endif -} - -template -void P2PSync::InternalThreadEntry() { - Caffe::SetDevice(solver_->param().device_id()); - CHECK(Caffe::root_solver()); - Caffe::set_root_solver(false); - // See if there is a defined seed and reset random state if so - if (solver_->param().random_seed() >= 0) { - // Fetch random seed and modulate by device ID to make sure - // everyone doesn't have the same seed. 
We seem to have some - // solver instability if we have everyone with the same seed - Caffe::set_random_seed( - solver_->param().random_seed() + solver_->param().device_id()); - } - solver_->Step(solver_->param().max_iter() - initial_iter_); -} - -template -void P2PSync::on_start() { -#ifndef CPU_ONLY -#ifdef DEBUG - int device; - CUDA_CHECK(cudaGetDevice(&device)); - CHECK(device == solver_->param().device_id()); -#else -// CHECK(false); -#endif - - // Wait for update from parent - if (parent_) { - P2PSync *parent = queue_.pop(); - CHECK(parent == parent_); - } - - // Update children - for (int i = children_.size() - 1; i >= 0; i--) { - Dtype* src = data_; - Dtype* dst = children_[i]->data_; - -#ifdef DEBUG - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == children_[i]->solver_->param().device_id()); -#endif - - CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), - cudaMemcpyDeviceToDevice, cudaStreamDefault)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - children_[i]->queue_.push(this); - } -#endif -} - -template -void P2PSync::on_gradients_ready() { -#ifndef CPU_ONLY -#ifdef DEBUG - int device; - CUDA_CHECK(cudaGetDevice(&device)); - CHECK(device == solver_->param().device_id()); -#endif - - // Sum children gradients as they appear in the queue - for (int i = 0; i < children_.size(); ++i) { - P2PSync *child = queue_.pop(); - Dtype* src = child->parent_grads_; - Dtype* dst = diff_; - -#ifdef DEBUG - bool ok = false; - for (int j = 0; j < children_.size(); ++j) { - if (child == children_[j]) { - ok = true; - } - } - CHECK(ok); - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == device); -#endif - - caffe_gpu_add(size_, src, dst, dst); - } - - // Send gradients to parent - if (parent_) { - Dtype* src = diff_; - Dtype* dst = parent_grads_; - -#ifdef DEBUG - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == parent_->solver_->param().device_id()); -#endif - - CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), // - cudaMemcpyDeviceToDevice, cudaStreamDefault)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - parent_->queue_.push(this); - } else { - // Loss functions divide gradients by the batch size, so to compensate - // for split batch, the root solver divides by number of solvers. - caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_); - } -#endif -} - -template -void P2PSync::Prepare(const vector& gpus, - vector > >* syncs) { - // Pair devices for map-reduce synchronization - vector pairs; - DevicePair::compute(gpus, &pairs); - ostringstream s; - for (int i = 1; i < pairs.size(); ++i) { - s << (i == 1 ? "" : ", ") << pairs[i].parent() << ":" << pairs[i].device(); - } - LOG(INFO)<< "GPUs pairs " << s.str(); - - SolverParameter param(solver_->param()); - - // Build the GPU tree by finding the parent for each solver - for (int attempts = 0; attempts < pairs.size(); ++attempts) { - for (int i = 1; i < pairs.size(); ++i) { - if (!syncs->at(i).get()) { - P2PSync* parent = NULL; - for (int j = 0; j < syncs->size(); ++j) { - P2PSync* sync = j == 0 ? 
this : syncs->at(j).get(); - if (sync) { - const SolverParameter& p = sync->solver()->param(); - if (p.device_id() == pairs[i].parent()) { - parent = sync; - } - } - } - if (parent) { - param.set_device_id(pairs[i].device()); - syncs->at(i).reset(new P2PSync(solver_, parent, param)); - parent->children_.push_back((P2PSync*) syncs->at(i).get()); - } - } - } - } -} - -template -void P2PSync::Run(const vector& gpus) { - vector > > syncs(gpus.size()); - Prepare(gpus, &syncs); - - LOG(INFO)<< "Starting Optimization"; - - for (int i = 1; i < syncs.size(); ++i) { - syncs[i]->StartInternalThread(); - } - - // Run root solver on current thread - solver_->Solve(); - - for (int i = 1; i < syncs.size(); ++i) { - syncs[i]->StopInternalThread(); - } -} - -INSTANTIATE_CLASS(Params); -INSTANTIATE_CLASS(GPUParams); -INSTANTIATE_CLASS(P2PSync); - -} // namespace caffe diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp deleted file mode 100644 index 77de6be..0000000 --- a/src/caffe/solver.cpp +++ /dev/null @@ -1,496 +0,0 @@ -#include - -#include -#include - -#include "caffe/solver.hpp" -#include "caffe/util/format.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/upgrade_proto.hpp" - -namespace caffe { - -template -void Solver::SetActionFunction(ActionCallback func) { - action_request_function_ = func; -} - -template -SolverAction::Enum Solver::GetRequestedAction() { - if (action_request_function_) { - // If the external request function has been set, call it. - return action_request_function_(); - } - return SolverAction::NONE; -} - -template -Solver::Solver(const SolverParameter& param, const Solver* root_solver) - : net_(), callbacks_(), root_solver_(root_solver), - requested_early_exit_(false) { - Init(param); -} - -template -Solver::Solver(const string& param_file, const Solver* root_solver) - : net_(), callbacks_(), root_solver_(root_solver), - requested_early_exit_(false) { - SolverParameter param; - ReadSolverParamsFromTextFileOrDie(param_file, ¶m); - Init(param); -} - -template -void Solver::Init(const SolverParameter& param) { - CHECK(Caffe::root_solver() || root_solver_) - << "root_solver_ needs to be set for all non-root solvers"; - LOG_IF(INFO, Caffe::root_solver()) << "Initializing solver from parameters: " - << std::endl << param.DebugString(); - param_ = param; - CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; - CheckSnapshotWritePermissions(); - if (Caffe::root_solver() && param_.random_seed() >= 0) { - Caffe::set_random_seed(param_.random_seed()); - } - // Scaffolding code - InitTrainNet(); - if (Caffe::root_solver()) { - InitTestNets(); - LOG(INFO) << "Solver scaffolding done."; - } - iter_ = 0; - current_step_ = 0; -} - -template -void Solver::InitTrainNet() { - const int num_train_nets = param_.has_net() + param_.has_net_param() + - param_.has_train_net() + param_.has_train_net_param(); - const string& field_names = "net, net_param, train_net, train_net_param"; - CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " - << "using one of these fields: " << field_names; - CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than " - << "one of these fields specifying a train_net: " << field_names; - NetParameter net_param; - if (param_.has_train_net_param()) { - LOG_IF(INFO, Caffe::root_solver()) - << "Creating training net specified in train_net_param."; - net_param.CopyFrom(param_.train_net_param()); - } else if (param_.has_train_net()) { - LOG_IF(INFO, Caffe::root_solver()) - << "Creating training net from 
train_net file: " << param_.train_net(); - ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); - } - if (param_.has_net_param()) { - LOG_IF(INFO, Caffe::root_solver()) - << "Creating training net specified in net_param."; - net_param.CopyFrom(param_.net_param()); - } - if (param_.has_net()) { - LOG_IF(INFO, Caffe::root_solver()) - << "Creating training net from net file: " << param_.net(); - ReadNetParamsFromTextFileOrDie(param_.net(), &net_param); - } - // Set the correct NetState. We start with the solver defaults (lowest - // precedence); then, merge in any NetState specified by the net_param itself; - // finally, merge in any NetState specified by the train_state (highest - // precedence). - NetState net_state; - net_state.set_phase(TRAIN); - net_state.MergeFrom(net_param.state()); - net_state.MergeFrom(param_.train_state()); - net_param.mutable_state()->CopyFrom(net_state); - if (Caffe::root_solver()) { - net_.reset(new Net(net_param)); - } else { - net_.reset(new Net(net_param, root_solver_->net_.get())); - } -} - -template -void Solver::InitTestNets() { - CHECK(Caffe::root_solver()); - const bool has_net_param = param_.has_net_param(); - const bool has_net_file = param_.has_net(); - const int num_generic_nets = has_net_param + has_net_file; - CHECK_LE(num_generic_nets, 1) - << "Both net_param and net_file may not be specified."; - const int num_test_net_params = param_.test_net_param_size(); - const int num_test_net_files = param_.test_net_size(); - const int num_test_nets = num_test_net_params + num_test_net_files; - if (num_generic_nets) { - CHECK_GE(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; - } else { - CHECK_EQ(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; - } - // If we have a generic net (specified by net or net_param, rather than - // test_net or test_net_param), we may have an unlimited number of actual - // test networks -- the actual number is given by the number of remaining - // test_iters after any test nets specified by test_net_param and/or test_net - // are evaluated. 
- const int num_generic_net_instances = param_.test_iter_size() - num_test_nets; - const int num_test_net_instances = num_test_nets + num_generic_net_instances; - if (param_.test_state_size()) { - CHECK_EQ(param_.test_state_size(), num_test_net_instances) - << "test_state must be unspecified or specified once per test net."; - } - if (num_test_net_instances) { - CHECK_GT(param_.test_interval(), 0); - } - int test_net_id = 0; - vector sources(num_test_net_instances); - vector net_params(num_test_net_instances); - for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) { - sources[test_net_id] = "test_net_param"; - net_params[test_net_id].CopyFrom(param_.test_net_param(i)); - } - for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) { - sources[test_net_id] = "test_net file: " + param_.test_net(i); - ReadNetParamsFromTextFileOrDie(param_.test_net(i), - &net_params[test_net_id]); - } - const int remaining_test_nets = param_.test_iter_size() - test_net_id; - if (has_net_param) { - for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { - sources[test_net_id] = "net_param"; - net_params[test_net_id].CopyFrom(param_.net_param()); - } - } - if (has_net_file) { - for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { - sources[test_net_id] = "net file: " + param_.net(); - ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]); - } - } - test_nets_.resize(num_test_net_instances); - for (int i = 0; i < num_test_net_instances; ++i) { - // Set the correct NetState. We start with the solver defaults (lowest - // precedence); then, merge in any NetState specified by the net_param - // itself; finally, merge in any NetState specified by the test_state - // (highest precedence). - NetState net_state; - net_state.set_phase(TEST); - net_state.MergeFrom(net_params[i].state()); - if (param_.test_state_size()) { - net_state.MergeFrom(param_.test_state(i)); - } - net_params[i].mutable_state()->CopyFrom(net_state); - LOG(INFO) - << "Creating test net (#" << i << ") specified by " << sources[i]; - if (Caffe::root_solver()) { - test_nets_[i].reset(new Net(net_params[i])); - } else { - test_nets_[i].reset(new Net(net_params[i], - root_solver_->test_nets_[i].get())); - } - test_nets_[i]->set_debug_info(param_.debug_info()); - } -} - -template -void Solver::Step(int iters) { - const int start_iter = iter_; - const int stop_iter = iter_ + iters; - int average_loss = this->param_.average_loss(); - losses_.clear(); - smoothed_loss_ = 0; - - while (iter_ < stop_iter) { - // zero-init the params - net_->ClearParamDiffs(); - if (param_.test_interval() && iter_ % param_.test_interval() == 0 - && (iter_ > 0 || param_.test_initialization()) - && Caffe::root_solver()) { - TestAll(); - if (requested_early_exit_) { - // Break out of the while loop because stop was requested while testing. 
- break; - } - } - - for (int i = 0; i < callbacks_.size(); ++i) { - callbacks_[i]->on_start(); - } - const bool display = param_.display() && iter_ % param_.display() == 0; - net_->set_debug_info(display && param_.debug_info()); - // accumulate the loss and gradient - Dtype loss = 0; - for (int i = 0; i < param_.iter_size(); ++i) { - loss += net_->ForwardBackward(); - } - loss /= param_.iter_size(); - // average the loss across iterations for smoothed reporting - UpdateSmoothedLoss(loss, start_iter, average_loss); - if (display) { - LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_ - << ", loss = " << smoothed_loss_; - const vector*>& result = net_->output_blobs(); - int score_index = 0; - for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); - const string& output_name = - net_->blob_names()[net_->output_blob_indices()[j]]; - const Dtype loss_weight = - net_->blob_loss_weights()[net_->output_blob_indices()[j]]; - for (int k = 0; k < result[j]->count(); ++k) { - ostringstream loss_msg_stream; - if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * result_vec[k] << " loss)"; - } - LOG_IF(INFO, Caffe::root_solver()) << " Train net output #" - << score_index++ << ": " << output_name << " = " - << result_vec[k] << loss_msg_stream.str(); - } - } - } - for (int i = 0; i < callbacks_.size(); ++i) { - callbacks_[i]->on_gradients_ready(); - } - ApplyUpdate(); - - // Increment the internal iter_ counter -- its value should always indicate - // the number of times the weights have been updated. - ++iter_; - - SolverAction::Enum request = GetRequestedAction(); - - // Save a snapshot if needed. - if ((param_.snapshot() - && iter_ % param_.snapshot() == 0 - && Caffe::root_solver()) || - (request == SolverAction::SNAPSHOT)) { - Snapshot(); - } - if (SolverAction::STOP == request) { - requested_early_exit_ = true; - // Break out of training loop. - break; - } - } -} - -template -void Solver::Solve(const char* resume_file) { - CHECK(Caffe::root_solver()); - LOG(INFO) << "Solving " << net_->name(); - LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy(); - - // Initialize to false every time we start solving. - requested_early_exit_ = false; - - if (resume_file) { - LOG(INFO) << "Restoring previous solver status from " << resume_file; - Restore(resume_file); - } - - // For a network that is trained by the solver, no bottom or top vecs - // should be given, and we will just provide dummy vecs. - int start_iter = iter_; - Step(param_.max_iter() - iter_); - // If we haven't already, save a snapshot after optimization, unless - // overridden by setting snapshot_after_train := false - if (param_.snapshot_after_train() - && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) { - Snapshot(); - } - if (requested_early_exit_) { - LOG(INFO) << "Optimization stopped early."; - return; - } - // After the optimization is done, run an additional train and test pass to - // display the train and test loss/outputs if appropriate (based on the - // display and test_interval settings, respectively). Unlike in the rest of - // training, for the train net we only run a forward pass as we've already - // updated the parameters "max_iter" times -- this final pass is only done to - // display the loss, which is computed in the forward pass. 
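- // Note that this extra forward pass only refreshes smoothed_loss_ for
- // logging; ApplyUpdate() is not called here, so the weights stay exactly as
- // the final Step() iteration left them.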
- if (param_.display() && iter_ % param_.display() == 0) { - int average_loss = this->param_.average_loss(); - Dtype loss; - net_->Forward(&loss); - - UpdateSmoothedLoss(loss, start_iter, average_loss); - - LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss_; - } - if (param_.test_interval() && iter_ % param_.test_interval() == 0) { - TestAll(); - } - LOG(INFO) << "Optimization Done."; -} - -template -void Solver::TestAll() { - for (int test_net_id = 0; - test_net_id < test_nets_.size() && !requested_early_exit_; - ++test_net_id) { - Test(test_net_id); - } -} - -template -void Solver::Test(const int test_net_id) { - CHECK(Caffe::root_solver()); - LOG(INFO) << "Iteration " << iter_ - << ", Testing net (#" << test_net_id << ")"; - CHECK_NOTNULL(test_nets_[test_net_id].get())-> - ShareTrainedLayersWith(net_.get()); - vector test_score; - vector test_score_output_id; - const shared_ptr >& test_net = test_nets_[test_net_id]; - Dtype loss = 0; - for (int i = 0; i < param_.test_iter(test_net_id); ++i) { - SolverAction::Enum request = GetRequestedAction(); - // Check to see if stoppage of testing/training has been requested. - while (request != SolverAction::NONE) { - if (SolverAction::SNAPSHOT == request) { - Snapshot(); - } else if (SolverAction::STOP == request) { - requested_early_exit_ = true; - } - request = GetRequestedAction(); - } - if (requested_early_exit_) { - // break out of test loop. - break; - } - - Dtype iter_loss; - const vector*>& result = - test_net->Forward(&iter_loss); - if (param_.test_compute_loss()) { - loss += iter_loss; - } - if (i == 0) { - for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); - for (int k = 0; k < result[j]->count(); ++k) { - test_score.push_back(result_vec[k]); - test_score_output_id.push_back(j); - } - } - } else { - int idx = 0; - for (int j = 0; j < result.size(); ++j) { - const Dtype* result_vec = result[j]->cpu_data(); - for (int k = 0; k < result[j]->count(); ++k) { - test_score[idx++] += result_vec[k]; - } - } - } - } - if (requested_early_exit_) { - LOG(INFO) << "Test interrupted."; - return; - } - if (param_.test_compute_loss()) { - loss /= param_.test_iter(test_net_id); - LOG(INFO) << "Test loss: " << loss; - } - for (int i = 0; i < test_score.size(); ++i) { - const int output_blob_index = - test_net->output_blob_indices()[test_score_output_id[i]]; - const string& output_name = test_net->blob_names()[output_blob_index]; - const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index]; - ostringstream loss_msg_stream; - const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); - if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * mean_score << " loss)"; - } - LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " - << mean_score << loss_msg_stream.str(); - } -} - -template -void Solver::Snapshot() { - CHECK(Caffe::root_solver()); - string model_filename; - switch (param_.snapshot_format()) { - case caffe::SolverParameter_SnapshotFormat_BINARYPROTO: - model_filename = SnapshotToBinaryProto(); - break; - case caffe::SolverParameter_SnapshotFormat_HDF5: - model_filename = SnapshotToHDF5(); - break; - default: - LOG(FATAL) << "Unsupported snapshot format."; - } - - SnapshotSolverState(model_filename); -} - -template -void Solver::CheckSnapshotWritePermissions() { - if (Caffe::root_solver() && param_.snapshot()) { - CHECK(param_.has_snapshot_prefix()) - << "In solver params, snapshot is specified but 
snapshot_prefix is not"; - string probe_filename = SnapshotFilename(".tempfile"); - std::ofstream probe_ofs(probe_filename.c_str()); - if (probe_ofs.good()) { - probe_ofs.close(); - std::remove(probe_filename.c_str()); - } else { - LOG(FATAL) << "Cannot write to snapshot prefix '" - << param_.snapshot_prefix() << "'. Make sure " - << "that the directory exists and is writeable."; - } - } -} - -template -string Solver::SnapshotFilename(const string extension) { - return param_.snapshot_prefix() + "_iter_" + caffe::format_int(iter_) - + extension; -} - -template -string Solver::SnapshotToBinaryProto() { - string model_filename = SnapshotFilename(".caffemodel"); - LOG(INFO) << "Snapshotting to binary proto file " << model_filename; - NetParameter net_param; - net_->ToProto(&net_param, param_.snapshot_diff()); - WriteProtoToBinaryFile(net_param, model_filename); - return model_filename; -} - -template -string Solver::SnapshotToHDF5() { - string model_filename = SnapshotFilename(".caffemodel.h5"); - LOG(INFO) << "Snapshotting to HDF5 file " << model_filename; - net_->ToHDF5(model_filename, param_.snapshot_diff()); - return model_filename; -} - -template -void Solver::Restore(const char* state_file) { - CHECK(Caffe::root_solver()); - string state_filename(state_file); - if (state_filename.size() >= 3 && - state_filename.compare(state_filename.size() - 3, 3, ".h5") == 0) { - RestoreSolverStateFromHDF5(state_filename); - } else { - RestoreSolverStateFromBinaryProto(state_filename); - } -} - -template -void Solver::UpdateSmoothedLoss(Dtype loss, int start_iter, - int average_loss) { - if (losses_.size() < average_loss) { - losses_.push_back(loss); - int size = losses_.size(); - smoothed_loss_ = (smoothed_loss_ * (size - 1) + loss) / size; - } else { - int idx = (iter_ - start_iter) % average_loss; - smoothed_loss_ += (loss - losses_[idx]) / average_loss; - losses_[idx] = loss; - } -} - -INSTANTIATE_CLASS(Solver); - -} // namespace caffe diff --git a/src/caffe/solvers/adadelta_solver.cpp b/src/caffe/solvers/adadelta_solver.cpp deleted file mode 100644 index fd30f19..0000000 --- a/src/caffe/solvers/adadelta_solver.cpp +++ /dev/null @@ -1,112 +0,0 @@ -#include - -#include "caffe/sgd_solvers.hpp" - -namespace caffe { - -template -void AdaDeltaSolver::AdaDeltaPreSolve() { - // Add the extra history entries for AdaDelta after those from - // SGDSolver::PreSolve - const vector*>& net_params = this->net_->learnable_params(); - for (int i = 0; i < net_params.size(); ++i) { - const vector& shape = net_params[i]->shape(); - this->history_.push_back( - shared_ptr >(new Blob(shape))); - } -} - -#ifndef CPU_ONLY -template -void adadelta_update_gpu(int N, Dtype* g, Dtype* h, Dtype* h2, Dtype momentum, - Dtype delta, Dtype local_rate); -#endif - -template -void AdaDeltaSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector*>& net_params = this->net_->learnable_params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype delta = this->param_.delta(); - Dtype momentum = this->param_.momentum(); - Dtype local_rate = rate * net_params_lr[param_id]; - size_t update_history_offset = net_params.size(); - switch (Caffe::mode()) { - case Caffe::CPU: { - // compute square of gradient in update - caffe_powx(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), Dtype(2), - this->update_[param_id]->mutable_cpu_data()); - - // update history of gradients - caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum, - this->update_[param_id]->cpu_data(), momentum, - 
this->history_[param_id]->mutable_cpu_data()); - - // add delta to history to guard against dividing by zero later - caffe_set(net_params[param_id]->count(), delta, - this->temp_[param_id]->mutable_cpu_data()); - - caffe_add(net_params[param_id]->count(), - this->temp_[param_id]->cpu_data(), - this->history_[update_history_offset + param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - caffe_add(net_params[param_id]->count(), - this->temp_[param_id]->cpu_data(), - this->history_[param_id]->cpu_data(), - this->temp_[param_id]->mutable_cpu_data()); - - // divide history of updates by history of gradients - caffe_div(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - this->temp_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - // jointly compute the RMS of both for update and gradient history - caffe_powx(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_cpu_data()); - - // compute the update - caffe_mul(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), - this->update_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - - // compute square of update - caffe_powx(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), Dtype(2), - this->update_[param_id]->mutable_cpu_data()); - - // update history of updates - caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum, - this->update_[param_id]->cpu_data(), momentum, - this->history_[update_history_offset + param_id]->mutable_cpu_data()); - - // apply learning rate - caffe_cpu_scale(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - adadelta_update_gpu(net_params[param_id]->count(), - net_params[param_id]->mutable_gpu_diff(), - this->history_[param_id]->mutable_gpu_data(), - this->history_[update_history_offset + param_id]->mutable_gpu_data(), - momentum, delta, local_rate); -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } -} - -INSTANTIATE_CLASS(AdaDeltaSolver); -REGISTER_SOLVER_CLASS(AdaDelta); - -} // namespace caffe diff --git a/src/caffe/solvers/adadelta_solver.cu b/src/caffe/solvers/adadelta_solver.cu deleted file mode 100644 index 6c94585..0000000 --- a/src/caffe/solvers/adadelta_solver.cu +++ /dev/null @@ -1,30 +0,0 @@ -#include "caffe/util/math_functions.hpp" - - -namespace caffe { - -template -__global__ void AdaDeltaUpdate(int N, Dtype* g, Dtype* h, Dtype* h2, - Dtype momentum, Dtype delta, Dtype local_rate) { - CUDA_KERNEL_LOOP(i, N) { - float gi = g[i]; - float hi = h[i] = momentum * h[i] + (1-momentum) * gi * gi; - gi = gi * sqrt((h2[i] + delta) / (hi + delta)); - h2[i] = momentum * h2[i] + (1-momentum) * gi * gi; - g[i] = local_rate * gi; - } -} -template -void adadelta_update_gpu(int N, Dtype* g, Dtype* h, Dtype* h2, Dtype momentum, - Dtype delta, Dtype local_rate) { - AdaDeltaUpdate // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - N, g, h, h2, momentum, delta, local_rate); - CUDA_POST_KERNEL_CHECK; -} -template void adadelta_update_gpu(int , float*, float*, float*, - float, float, float); -template void adadelta_update_gpu(int, double*, double*, double*, - double, double, double); - -} // namespace caffe diff --git a/src/caffe/solvers/adagrad_solver.cpp b/src/caffe/solvers/adagrad_solver.cpp deleted file mode 100644 index e78eadc..0000000 --- 
a/src/caffe/solvers/adagrad_solver.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#include - -#include "caffe/sgd_solvers.hpp" - -namespace caffe { - -#ifndef CPU_ONLY -template -void adagrad_update_gpu(int N, Dtype* g, Dtype* h, Dtype delta, - Dtype local_rate); -#endif - -template -void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { - CHECK(Caffe::root_solver()); - const vector*>& net_params = this->net_->learnable_params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype delta = this->param_.delta(); - Dtype local_rate = rate * net_params_lr[param_id]; - switch (Caffe::mode()) { - case Caffe::CPU: { - // compute square of gradient in update - caffe_powx(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), Dtype(2), - this->update_[param_id]->mutable_cpu_data()); - - // update history - caffe_add(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - this->history_[param_id]->cpu_data(), - this->history_[param_id]->mutable_cpu_data()); - - // prepare update - caffe_powx(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_cpu_data()); - - caffe_add_scalar(net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_cpu_data()); - - caffe_div(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), - this->update_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - // scale and copy - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->cpu_data(), Dtype(0), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - adagrad_update_gpu(net_params[param_id]->count(), - net_params[param_id]->mutable_gpu_diff(), - this->history_[param_id]->mutable_gpu_data(), delta, local_rate); -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } -} - -INSTANTIATE_CLASS(AdaGradSolver); -REGISTER_SOLVER_CLASS(AdaGrad); - -} // namespace caffe diff --git a/src/caffe/solvers/adagrad_solver.cu b/src/caffe/solvers/adagrad_solver.cu deleted file mode 100644 index adefd55..0000000 --- a/src/caffe/solvers/adagrad_solver.cu +++ /dev/null @@ -1,26 +0,0 @@ -#include "caffe/util/math_functions.hpp" - - -namespace caffe { - -template -__global__ void AdaGradUpdate(int N, Dtype* g, Dtype* h, Dtype delta, - Dtype local_rate) { - CUDA_KERNEL_LOOP(i, N) { - float gi = g[i]; - float hi = h[i] = h[i] + gi*gi; - g[i] = local_rate * gi / (sqrt(hi) + delta); - } -} -template -void adagrad_update_gpu(int N, Dtype* g, Dtype* h, Dtype delta, - Dtype local_rate) { - AdaGradUpdate // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - N, g, h, delta, local_rate); - CUDA_POST_KERNEL_CHECK; -} -template void adagrad_update_gpu(int, float*, float*, float, float); -template void adagrad_update_gpu(int, double*, double*, double, double); - -} // namespace caffe diff --git a/src/caffe/solvers/adam_solver.cpp b/src/caffe/solvers/adam_solver.cpp deleted file mode 100644 index 4a91f00..0000000 --- a/src/caffe/solvers/adam_solver.cpp +++ /dev/null @@ -1,94 +0,0 @@ -#include - -#include "caffe/sgd_solvers.hpp" - -namespace caffe { - -template -void AdamSolver::AdamPreSolve() { - // Add the extra history entries for Adam after those from - // SGDSolver::PreSolve - const vector*>& net_params = this->net_->learnable_params(); - for (int i = 0; i < net_params.size(); ++i) { - const vector& shape = net_params[i]->shape(); - this->history_.push_back( - shared_ptr 
>(new Blob(shape))); - } -} - -#ifndef CPU_ONLY -template -void adam_update_gpu(int N, Dtype* g, Dtype* m, Dtype* v, Dtype beta1, - Dtype beta2, Dtype eps_hat, Dtype corrected_local_rate); -#endif - -template -void AdamSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector*>& net_params = this->net_->learnable_params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype local_rate = rate * net_params_lr[param_id]; - const Dtype beta1 = this->param_.momentum(); - const Dtype beta2 = this->param_.momentum2(); - - // we create aliases for convenience - size_t update_history_offset = net_params.size(); - Blob* val_m = this->history_[param_id].get(); - Blob* val_v = this->history_[param_id + update_history_offset].get(); - Blob* val_t = this->temp_[param_id].get(); - - const int t = this->iter_ + 1; - const Dtype correction = std::sqrt(Dtype(1) - pow(beta2, t)) / - (Dtype(1.) - pow(beta1, t)); - const int N = net_params[param_id]->count(); - const Dtype eps_hat = this->param_.delta(); - - switch (Caffe::mode()) { - case Caffe::CPU: { - // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t - caffe_cpu_axpby(N, Dtype(1)-beta1, - net_params[param_id]->cpu_diff(), beta1, - val_m->mutable_cpu_data()); - - // update v <- \beta_2 m_{t-1} + (1-\beta_2)g_t^2 - caffe_mul(N, - net_params[param_id]->cpu_diff(), - net_params[param_id]->cpu_diff(), - val_t->mutable_cpu_data()); - caffe_cpu_axpby(N, Dtype(1)-beta2, - val_t->cpu_data(), beta2, - val_v->mutable_cpu_data()); - - // set update - caffe_powx(N, - val_v->cpu_data(), Dtype(0.5), - val_t->mutable_cpu_data()); - caffe_add_scalar(N, eps_hat, val_t->mutable_cpu_data()); - caffe_div(N, - val_m->cpu_data(), - val_t->cpu_data(), - val_t->mutable_cpu_data()); - - caffe_cpu_scale(N, local_rate*correction, - val_t->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - adam_update_gpu(N, net_params[param_id]->mutable_gpu_diff(), - val_m->mutable_gpu_data(), val_v->mutable_gpu_data(), beta1, beta2, - eps_hat, local_rate*correction); -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } -} - -INSTANTIATE_CLASS(AdamSolver); -REGISTER_SOLVER_CLASS(Adam); - -} // namespace caffe diff --git a/src/caffe/solvers/adam_solver.cu b/src/caffe/solvers/adam_solver.cu deleted file mode 100644 index 917ae10..0000000 --- a/src/caffe/solvers/adam_solver.cu +++ /dev/null @@ -1,29 +0,0 @@ -#include "caffe/util/math_functions.hpp" - - -namespace caffe { - -template -__global__ void AdamUpdate(int N, Dtype* g, Dtype* m, Dtype* v, - Dtype beta1, Dtype beta2, Dtype eps_hat, Dtype corrected_local_rate) { - CUDA_KERNEL_LOOP(i, N) { - float gi = g[i]; - float mi = m[i] = m[i]*beta1 + gi*(1-beta1); - float vi = v[i] = v[i]*beta2 + gi*gi*(1-beta2); - g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat); - } -} -template -void adam_update_gpu(int N, Dtype* g, Dtype* m, Dtype* v, Dtype beta1, - Dtype beta2, Dtype eps_hat, Dtype corrected_local_rate) { - AdamUpdate // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - N, g, m, v, beta1, beta2, eps_hat, corrected_local_rate); - CUDA_POST_KERNEL_CHECK; -} -template void adam_update_gpu(int, float*, float*, float*, - float, float, float, float); -template void adam_update_gpu(int, double*, double*, double*, - double, double, double, double); - -} // namespace caffe diff --git a/src/caffe/solvers/nesterov_solver.cpp b/src/caffe/solvers/nesterov_solver.cpp deleted file mode 100644 index 23ab2d4..0000000 --- 
a/src/caffe/solvers/nesterov_solver.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include - -#include "caffe/sgd_solvers.hpp" - -namespace caffe { - -#ifndef CPU_ONLY -template -void nesterov_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum, - Dtype local_rate); -#endif - -template -void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { - CHECK(Caffe::root_solver()); - const vector*>& net_params = this->net_->learnable_params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype momentum = this->param_.momentum(); - Dtype local_rate = rate * net_params_lr[param_id]; - switch (Caffe::mode()) { - case Caffe::CPU: { - // save history momentum for stepping back - caffe_copy(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - // update history - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - this->history_[param_id]->mutable_cpu_data()); - - // compute update: step back then over step - caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, - this->history_[param_id]->cpu_data(), -momentum, - this->update_[param_id]->mutable_cpu_data()); - - // copy - caffe_copy(net_params[param_id]->count(), - this->update_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - nesterov_update_gpu(net_params[param_id]->count(), - net_params[param_id]->mutable_gpu_diff(), - this->history_[param_id]->mutable_gpu_data(), - momentum, local_rate); -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } -} - -INSTANTIATE_CLASS(NesterovSolver); -REGISTER_SOLVER_CLASS(Nesterov); - -} // namespace caffe diff --git a/src/caffe/solvers/nesterov_solver.cu b/src/caffe/solvers/nesterov_solver.cu deleted file mode 100644 index 57a456b..0000000 --- a/src/caffe/solvers/nesterov_solver.cu +++ /dev/null @@ -1,27 +0,0 @@ -#include "caffe/util/math_functions.hpp" - - -namespace caffe { - -template -__global__ void NesterovUpdate(int N, Dtype* g, Dtype* h, - Dtype momentum, Dtype local_rate) { - CUDA_KERNEL_LOOP(i, N) { - float hi = h[i]; - float hi_new = h[i] = momentum * hi + local_rate * g[i]; - g[i] = (1+momentum) * hi_new - momentum * hi; - } -} -template -void nesterov_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum, - Dtype local_rate) { - NesterovUpdate // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - N, g, h, momentum, local_rate); - CUDA_POST_KERNEL_CHECK; -} -template void nesterov_update_gpu(int, float*, float*, float, float); -template void nesterov_update_gpu(int, double*, double*, double, - double); - -} // namespace caffe diff --git a/src/caffe/solvers/rmsprop_solver.cpp b/src/caffe/solvers/rmsprop_solver.cpp deleted file mode 100644 index 3251ee4..0000000 --- a/src/caffe/solvers/rmsprop_solver.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#include - -#include "caffe/sgd_solvers.hpp" - -namespace caffe { - -#ifndef CPU_ONLY -template -void rmsprop_update_gpu(int N, Dtype* g, Dtype* h, Dtype rms_decay, - Dtype delta, Dtype local_rate); -#endif - -template -void RMSPropSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector*>& net_params = this->net_->learnable_params(); - const vector& net_params_lr = this->net_->params_lr(); - - // get the learning rate - Dtype delta = this->param_.delta(); - Dtype rms_decay = this->param_.rms_decay(); - Dtype local_rate = rate * net_params_lr[param_id]; - - switch (Caffe::mode()) { - case 
Caffe::CPU: - // compute square of gradient in update - caffe_powx(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), Dtype(2), - this->update_[param_id]->mutable_cpu_data()); - - // update history - caffe_cpu_axpby(net_params[param_id] -> count(), - Dtype(1-rms_decay), this->update_[param_id]->cpu_data(), - rms_decay, this->history_[param_id]-> mutable_cpu_data()); - - // prepare update - caffe_powx(net_params[param_id]->count(), - this->history_[param_id]->cpu_data(), Dtype(0.5), - this->update_[param_id]->mutable_cpu_data()); - - caffe_add_scalar(net_params[param_id]->count(), - delta, this->update_[param_id]->mutable_cpu_data()); - - caffe_div(net_params[param_id]->count(), - net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(), - this->update_[param_id]->mutable_cpu_data()); - - // scale and copy - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - this->update_[param_id]->cpu_data(), Dtype(0), - net_params[param_id]->mutable_cpu_diff()); - break; - case Caffe::GPU: -#ifndef CPU_ONLY - rmsprop_update_gpu(net_params[param_id]->count(), - net_params[param_id]->mutable_gpu_diff(), - this->history_[param_id]->mutable_gpu_data(), - rms_decay, delta, local_rate); -#else - NO_GPU; -#endif - break; - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } -} - -INSTANTIATE_CLASS(RMSPropSolver); -REGISTER_SOLVER_CLASS(RMSProp); - -} // namespace caffe diff --git a/src/caffe/solvers/rmsprop_solver.cu b/src/caffe/solvers/rmsprop_solver.cu deleted file mode 100644 index c5ffd32..0000000 --- a/src/caffe/solvers/rmsprop_solver.cu +++ /dev/null @@ -1,28 +0,0 @@ -#include "caffe/util/math_functions.hpp" - - -namespace caffe { - -template -__global__ void RMSPropUpdate(int N, Dtype* g, Dtype* h, - Dtype rms_decay, Dtype delta, Dtype local_rate) { - CUDA_KERNEL_LOOP(i, N) { - float gi = g[i]; - float hi = h[i] = rms_decay*h[i] + (1-rms_decay)*gi*gi; - g[i] = local_rate * g[i] / (sqrt(hi) + delta); - } -} -template -void rmsprop_update_gpu(int N, Dtype* g, Dtype* h, Dtype rms_decay, - Dtype delta, Dtype local_rate) { - RMSPropUpdate // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - N, g, h, rms_decay, delta, local_rate); - CUDA_POST_KERNEL_CHECK; -} -template void rmsprop_update_gpu(int, float*, float*, float, float, - float); -template void rmsprop_update_gpu(int, double*, double*, double, double, - double); - -} // namespace caffe diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp deleted file mode 100644 index f30f316..0000000 --- a/src/caffe/solvers/sgd_solver.cpp +++ /dev/null @@ -1,352 +0,0 @@ -#include -#include - -#include "caffe/sgd_solvers.hpp" -#include "caffe/util/hdf5.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/upgrade_proto.hpp" - -namespace caffe { - -// Return the current learning rate. The currently implemented learning rate -// policies are as follows: -// - fixed: always return base_lr. -// - step: return base_lr * gamma ^ (floor(iter / step)) -// - exp: return base_lr * gamma ^ iter -// - inv: return base_lr * (1 + gamma * iter) ^ (- power) -// - multistep: similar to step but it allows non uniform steps defined by -// stepvalue -// - poly: the effective learning rate follows a polynomial decay, to be -// zero by the max_iter. 
return base_lr (1 - iter/max_iter) ^ (power) -// - sigmoid: the effective learning rate follows a sigmod decay -// return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) -// -// where base_lr, max_iter, gamma, step, stepvalue and power are defined -// in the solver parameter protocol buffer, and iter is the current iteration. -template -Dtype SGDSolver::GetLearningRate() { - Dtype rate; - const string& lr_policy = this->param_.lr_policy(); - if (lr_policy == "fixed") { - rate = this->param_.base_lr(); - } else if (lr_policy == "step") { - this->current_step_ = this->iter_ / this->param_.stepsize(); - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); - } else if (lr_policy == "exp") { - rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); - } else if (lr_policy == "inv") { - rate = this->param_.base_lr() * - pow(Dtype(1) + this->param_.gamma() * this->iter_, - - this->param_.power()); - } else if (lr_policy == "multistep") { - if (this->current_step_ < this->param_.stepvalue_size() && - this->iter_ >= this->param_.stepvalue(this->current_step_)) { - this->current_step_++; - LOG(INFO) << "MultiStep Status: Iteration " << - this->iter_ << ", step = " << this->current_step_; - } - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); - } else if (lr_policy == "poly") { - rate = this->param_.base_lr() * pow(Dtype(1.) - - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), - this->param_.power()); - } else if (lr_policy == "sigmoid") { - rate = this->param_.base_lr() * (Dtype(1.) / - (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) - - Dtype(this->param_.stepsize()))))); - } else { - LOG(FATAL) << "Unknown learning rate policy: " << lr_policy; - } - return rate; -} - -template -void SGDSolver::PreSolve() { - // Initialize the history - const vector*>& net_params = this->net_->learnable_params(); - history_.clear(); - update_.clear(); - temp_.clear(); - for (int i = 0; i < net_params.size(); ++i) { - const vector& shape = net_params[i]->shape(); - history_.push_back(shared_ptr >(new Blob(shape))); - update_.push_back(shared_ptr >(new Blob(shape))); - temp_.push_back(shared_ptr >(new Blob(shape))); - } -} - -template -void SGDSolver::ClipGradients() { - const Dtype clip_gradients = this->param_.clip_gradients(); - if (clip_gradients < 0) { return; } - const vector*>& net_params = this->net_->learnable_params(); - Dtype sumsq_diff = 0; - for (int i = 0; i < net_params.size(); ++i) { - sumsq_diff += net_params[i]->sumsq_diff(); - } - const Dtype l2norm_diff = std::sqrt(sumsq_diff); - if (l2norm_diff > clip_gradients) { - Dtype scale_factor = clip_gradients / l2norm_diff; - LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm " - << l2norm_diff << " > " << clip_gradients << ") " - << "by scale factor " << scale_factor; - for (int i = 0; i < net_params.size(); ++i) { - net_params[i]->scale_diff(scale_factor); - } - } -} - -template -void SGDSolver::ApplyUpdate() { - CHECK(Caffe::root_solver()); - Dtype rate = GetLearningRate(); - if (this->param_.display() && this->iter_ % this->param_.display() == 0) { - LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; - } - ClipGradients(); - for (int param_id = 0; param_id < this->net_->learnable_params().size(); - ++param_id) { - Normalize(param_id); - Regularize(param_id); - ComputeUpdateValue(param_id, rate); - } - this->net_->Update(); -} - -template -void SGDSolver::Normalize(int param_id) { - if (this->param_.iter_size() == 1) { return; } 
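- // Step() runs ForwardBackward() iter_size times per iteration, so at this
- // point each diff holds the sum of iter_size gradients; multiplying by
- // 1/iter_size below recovers the average before regularization and update.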
- // Scale gradient to counterbalance accumulation. - const vector*>& net_params = this->net_->learnable_params(); - const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); - switch (Caffe::mode()) { - case Caffe::CPU: { - caffe_scal(net_params[param_id]->count(), accum_normalization, - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, - net_params[param_id]->mutable_gpu_diff()); -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } -} - -template -void SGDSolver::Regularize(int param_id) { - const vector*>& net_params = this->net_->learnable_params(); - const vector& net_params_weight_decay = - this->net_->params_weight_decay(); - Dtype weight_decay = this->param_.weight_decay(); - string regularization_type = this->param_.regularization_type(); - Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; - switch (Caffe::mode()) { - case Caffe::CPU: { - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - caffe_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } else if (regularization_type == "L1") { - caffe_cpu_sign(net_params[param_id]->count(), - net_params[param_id]->cpu_data(), - temp_[param_id]->mutable_cpu_data()); - caffe_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - } else { - LOG(FATAL) << "Unknown regularization type: " << regularization_type; - } - } - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else if (regularization_type == "L1") { - caffe_gpu_sign(net_params[param_id]->count(), - net_params[param_id]->gpu_data(), - temp_[param_id]->mutable_gpu_data()); - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else { - LOG(FATAL) << "Unknown regularization type: " << regularization_type; - } - } -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } -} - -#ifndef CPU_ONLY -template -void sgd_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum, - Dtype local_rate); -#endif - -template -void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { - const vector*>& net_params = this->net_->learnable_params(); - const vector& net_params_lr = this->net_->params_lr(); - Dtype momentum = this->param_.momentum(); - Dtype local_rate = rate * net_params_lr[param_id]; - // Compute the update to history, then copy it to the parameter diff. 
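- // Concretely, with local_rate = rate * per-parameter lr multiplier:
- //   history = momentum * history + local_rate * diff
- //   diff    = history
- // net_->Update() then applies data -= diff for each learnable blob.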
- switch (Caffe::mode()) { - case Caffe::CPU: { - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - sgd_update_gpu(net_params[param_id]->count(), - net_params[param_id]->mutable_gpu_diff(), - history_[param_id]->mutable_gpu_data(), - momentum, local_rate); -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } -} - -template -void SGDSolver::SnapshotSolverState(const string& model_filename) { - switch (this->param_.snapshot_format()) { - case caffe::SolverParameter_SnapshotFormat_BINARYPROTO: - SnapshotSolverStateToBinaryProto(model_filename); - break; - case caffe::SolverParameter_SnapshotFormat_HDF5: - SnapshotSolverStateToHDF5(model_filename); - break; - default: - LOG(FATAL) << "Unsupported snapshot format."; - } -} - -template -void SGDSolver::SnapshotSolverStateToBinaryProto( - const string& model_filename) { - SolverState state; - state.set_iter(this->iter_); - state.set_learned_net(model_filename); - state.set_current_step(this->current_step_); - state.clear_history(); - for (int i = 0; i < history_.size(); ++i) { - // Add history - BlobProto* history_blob = state.add_history(); - history_[i]->ToProto(history_blob); - } - string snapshot_filename = Solver::SnapshotFilename(".solverstate"); - LOG(INFO) - << "Snapshotting solver state to binary proto file " << snapshot_filename; - WriteProtoToBinaryFile(state, snapshot_filename.c_str()); -} - -template -void SGDSolver::SnapshotSolverStateToHDF5( - const string& model_filename) { - string snapshot_filename = - Solver::SnapshotFilename(".solverstate.h5"); - LOG(INFO) << "Snapshotting solver state to HDF5 file " << snapshot_filename; - hid_t file_hid = H5Fcreate(snapshot_filename.c_str(), H5F_ACC_TRUNC, - H5P_DEFAULT, H5P_DEFAULT); - CHECK_GE(file_hid, 0) - << "Couldn't open " << snapshot_filename << " to save solver state."; - hdf5_save_int(file_hid, "iter", this->iter_); - hdf5_save_string(file_hid, "learned_net", model_filename); - hdf5_save_int(file_hid, "current_step", this->current_step_); - hid_t history_hid = H5Gcreate2(file_hid, "history", H5P_DEFAULT, H5P_DEFAULT, - H5P_DEFAULT); - CHECK_GE(history_hid, 0) - << "Error saving solver state to " << snapshot_filename << "."; - for (int i = 0; i < history_.size(); ++i) { - ostringstream oss; - oss << i; - hdf5_save_nd_dataset(history_hid, oss.str(), *history_[i]); - } - H5Gclose(history_hid); - H5Fclose(file_hid); -} - -template -void SGDSolver::RestoreSolverStateFromBinaryProto( - const string& state_file) { - SolverState state; - ReadProtoFromBinaryFile(state_file, &state); - this->iter_ = state.iter(); - if (state.has_learned_net()) { - NetParameter net_param; - ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param); - this->net_->CopyTrainedLayersFrom(net_param); - } - this->current_step_ = state.current_step(); - CHECK_EQ(state.history_size(), history_.size()) - << "Incorrect length of history blobs."; - LOG(INFO) << "SGDSolver: restoring history"; - for (int i = 0; i < history_.size(); ++i) { - history_[i]->FromProto(state.history(i)); - } -} - -template -void SGDSolver::RestoreSolverStateFromHDF5(const string& state_file) { - hid_t file_hid = H5Fopen(state_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); - CHECK_GE(file_hid, 0) << 
"Couldn't open solver state file " << state_file; - this->iter_ = hdf5_load_int(file_hid, "iter"); - if (H5LTfind_dataset(file_hid, "learned_net")) { - string learned_net = hdf5_load_string(file_hid, "learned_net"); - this->net_->CopyTrainedLayersFrom(learned_net); - } - this->current_step_ = hdf5_load_int(file_hid, "current_step"); - hid_t history_hid = H5Gopen2(file_hid, "history", H5P_DEFAULT); - CHECK_GE(history_hid, 0) << "Error reading history from " << state_file; - int state_history_size = hdf5_get_num_links(history_hid); - CHECK_EQ(state_history_size, history_.size()) - << "Incorrect length of history blobs."; - for (int i = 0; i < history_.size(); ++i) { - ostringstream oss; - oss << i; - hdf5_load_nd_dataset(history_hid, oss.str().c_str(), 0, - kMaxBlobAxes, history_[i].get()); - } - H5Gclose(history_hid); - H5Fclose(file_hid); -} - -INSTANTIATE_CLASS(SGDSolver); -REGISTER_SOLVER_CLASS(SGD); - -} // namespace caffe diff --git a/src/caffe/solvers/sgd_solver.cu b/src/caffe/solvers/sgd_solver.cu deleted file mode 100644 index e541035..0000000 --- a/src/caffe/solvers/sgd_solver.cu +++ /dev/null @@ -1,24 +0,0 @@ -#include "caffe/util/math_functions.hpp" - - -namespace caffe { - -template -__global__ void SGDUpdate(int N, Dtype* g, Dtype* h, - Dtype momentum, Dtype local_rate) { - CUDA_KERNEL_LOOP(i, N) { - g[i] = h[i] = momentum*h[i] + local_rate*g[i]; - } -} -template -void sgd_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum, - Dtype local_rate) { - SGDUpdate // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - N, g, h, momentum, local_rate); - CUDA_POST_KERNEL_CHECK; -} -template void sgd_update_gpu(int, float*, float*, float, float); -template void sgd_update_gpu(int, double*, double*, double, double); - -} // namespace caffe diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 4d35641..3fbf7df 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -8,18 +8,6 @@ SyncedMemory::~SyncedMemory() { if (cpu_ptr_ && own_cpu_data_) { CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_); } - -#ifndef CPU_ONLY - if (gpu_ptr_ && own_gpu_data_) { - int initial_device; - cudaGetDevice(&initial_device); - if (gpu_device_ != -1) { - CUDA_CHECK(cudaSetDevice(gpu_device_)); - } - CUDA_CHECK(cudaFree(gpu_ptr_)); - cudaSetDevice(initial_device); - } -#endif // CPU_ONLY } inline void SyncedMemory::to_cpu() { @@ -31,16 +19,7 @@ inline void SyncedMemory::to_cpu() { own_cpu_data_ = true; break; case HEAD_AT_GPU: -#ifndef CPU_ONLY - if (cpu_ptr_ == NULL) { - CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); - own_cpu_data_ = true; - } - caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_); - head_ = SYNCED; -#else NO_GPU; -#endif break; case HEAD_AT_CPU: case SYNCED: @@ -49,31 +28,7 @@ inline void SyncedMemory::to_cpu() { } inline void SyncedMemory::to_gpu() { -#ifndef CPU_ONLY - switch (head_) { - case UNINITIALIZED: - CUDA_CHECK(cudaGetDevice(&gpu_device_)); - CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); - caffe_gpu_memset(size_, 0, gpu_ptr_); - head_ = HEAD_AT_GPU; - own_gpu_data_ = true; - break; - case HEAD_AT_CPU: - if (gpu_ptr_ == NULL) { - CUDA_CHECK(cudaGetDevice(&gpu_device_)); - CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); - own_gpu_data_ = true; - } - caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_); - head_ = SYNCED; - break; - case HEAD_AT_GPU: - case SYNCED: - break; - } -#else NO_GPU; -#endif } const void* SyncedMemory::cpu_data() { @@ -92,33 +47,12 @@ void SyncedMemory::set_cpu_data(void* data) { } const void* SyncedMemory::gpu_data() { -#ifndef CPU_ONLY - to_gpu(); - 
return (const void*)gpu_ptr_; -#else NO_GPU; return NULL; -#endif } void SyncedMemory::set_gpu_data(void* data) { -#ifndef CPU_ONLY - CHECK(data); - if (own_gpu_data_) { - int initial_device; - cudaGetDevice(&initial_device); - if (gpu_device_ != -1) { - CUDA_CHECK(cudaSetDevice(gpu_device_)); - } - CUDA_CHECK(cudaFree(gpu_ptr_)); - cudaSetDevice(initial_device); - } - gpu_ptr_ = data; - head_ = HEAD_AT_GPU; - own_gpu_data_ = false; -#else NO_GPU; -#endif } void* SyncedMemory::mutable_cpu_data() { @@ -128,30 +62,9 @@ void* SyncedMemory::mutable_cpu_data() { } void* SyncedMemory::mutable_gpu_data() { -#ifndef CPU_ONLY - to_gpu(); - head_ = HEAD_AT_GPU; - return gpu_ptr_; -#else NO_GPU; return NULL; -#endif -} - -#ifndef CPU_ONLY -void SyncedMemory::async_gpu_push(const cudaStream_t& stream) { - CHECK(head_ == HEAD_AT_CPU); - if (gpu_ptr_ == NULL) { - CUDA_CHECK(cudaGetDevice(&gpu_device_)); - CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); - own_gpu_data_ = true; - } - const cudaMemcpyKind put = cudaMemcpyHostToDevice; - CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream)); - // Assume caller will synchronize on the stream before use - head_ = SYNCED; } -#endif } // namespace caffe diff --git a/src/caffe/test/CMakeLists.txt b/src/caffe/test/CMakeLists.txt deleted file mode 100644 index 35a803f..0000000 --- a/src/caffe/test/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -# The option allows to include in build only selected test files and exclude all others -# Usage example: -# cmake -DBUILD_only_tests="common,net,blob,im2col_kernel" -set(BUILD_only_tests "" CACHE STRING "Blank or comma-separated list of test files to build without 'test_' prefix and extention") -caffe_leave_only_selected_tests(test_srcs ${BUILD_only_tests}) -caffe_leave_only_selected_tests(test_cuda ${BUILD_only_tests}) - -# For 'make runtest' target we don't need to embed test data paths to -# source files, because test target is executed in source directory -# That's why the lines below are commented. 
TODO: remove them - -# definition needed to include CMake generated files -#add_definitions(-DCMAKE_BUILD) - -# generates test_data/sample_data_list.txt.gen.cmake -#caffe_configure_testdatafile(test_data/sample_data_list.txt) - -set(the_target test.testbin) -set(test_args --gtest_shuffle) - -if(HAVE_CUDA) - caffe_cuda_compile(test_cuda_objs ${test_cuda}) - list(APPEND test_srcs ${test_cuda_objs} ${test_cuda}) -else() - list(APPEND test_args --gtest_filter="-*GPU*") -endif() - -# ---[ Adding test target -add_executable(${the_target} EXCLUDE_FROM_ALL ${test_srcs}) -target_link_libraries(${the_target} gtest ${Caffe_LINK}) -caffe_default_properties(${the_target}) -caffe_set_runtime_directory(${the_target} "${PROJECT_BINARY_DIR}/test") - -# ---[ Adding runtest -add_custom_target(runtest COMMAND ${the_target} ${test_args} - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) diff --git a/src/caffe/test/test_accuracy_layer.cpp b/src/caffe/test/test_accuracy_layer.cpp deleted file mode 100644 index 6fe808b..0000000 --- a/src/caffe/test/test_accuracy_layer.cpp +++ /dev/null @@ -1,336 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/accuracy_layer.hpp" -#include "caffe/util/rng.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class AccuracyLayerTest : public CPUDeviceTest { - protected: - AccuracyLayerTest() - : blob_bottom_data_(new Blob()), - blob_bottom_label_(new Blob()), - blob_top_(new Blob()), - blob_top_per_class_(new Blob()), - top_k_(3) { - vector shape(2); - shape[0] = 100; - shape[1] = 10; - blob_bottom_data_->Reshape(shape); - shape.resize(1); - blob_bottom_label_->Reshape(shape); - FillBottoms(); - - blob_bottom_vec_.push_back(blob_bottom_data_); - blob_bottom_vec_.push_back(blob_bottom_label_); - blob_top_vec_.push_back(blob_top_); - blob_top_per_class_vec_.push_back(blob_top_); - blob_top_per_class_vec_.push_back(blob_top_per_class_); - } - - virtual void FillBottoms() { - // fill the probability values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_data_); - - const unsigned int prefetch_rng_seed = caffe_rng_rand(); - shared_ptr rng(new Caffe::RNG(prefetch_rng_seed)); - caffe::rng_t* prefetch_rng = - static_cast(rng->generator()); - Dtype* label_data = blob_bottom_label_->mutable_cpu_data(); - for (int i = 0; i < blob_bottom_label_->count(); ++i) { - label_data[i] = (*prefetch_rng)() % 10; - } - } - - virtual ~AccuracyLayerTest() { - delete blob_bottom_data_; - delete blob_bottom_label_; - delete blob_top_; - delete blob_top_per_class_; - } - Blob* const blob_bottom_data_; - Blob* const blob_bottom_label_; - Blob* const blob_top_; - Blob* const blob_top_per_class_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - vector*> blob_top_per_class_vec_; - int top_k_; -}; - -TYPED_TEST_CASE(AccuracyLayerTest, TestDtypes); - -TYPED_TEST(AccuracyLayerTest, TestSetup) { - LayerParameter layer_param; - AccuracyLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 1); - EXPECT_EQ(this->blob_top_->channels(), 1); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); -} - -TYPED_TEST(AccuracyLayerTest, TestSetupTopK) { - LayerParameter layer_param; - AccuracyParameter* accuracy_param = - layer_param.mutable_accuracy_param(); - accuracy_param->set_top_k(5); - AccuracyLayer layer(layer_param); - 
layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 1); - EXPECT_EQ(this->blob_top_->channels(), 1); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); -} - -TYPED_TEST(AccuracyLayerTest, TestSetupOutputPerClass) { - LayerParameter layer_param; - AccuracyLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_); - EXPECT_EQ(this->blob_top_->num(), 1); - EXPECT_EQ(this->blob_top_->channels(), 1); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); - EXPECT_EQ(this->blob_top_per_class_->num(), 10); - EXPECT_EQ(this->blob_top_per_class_->channels(), 1); - EXPECT_EQ(this->blob_top_per_class_->height(), 1); - EXPECT_EQ(this->blob_top_per_class_->width(), 1); -} - -TYPED_TEST(AccuracyLayerTest, TestForwardCPU) { - LayerParameter layer_param; - AccuracyLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - TypeParam max_value; - int max_id; - int num_correct_labels = 0; - for (int i = 0; i < 100; ++i) { - max_value = -FLT_MAX; - max_id = 0; - for (int j = 0; j < 10; ++j) { - if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { - max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - max_id = j; - } - } - if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; - } - } - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / 100.0, 1e-4); -} - -TYPED_TEST(AccuracyLayerTest, TestForwardWithSpatialAxes) { - this->blob_bottom_data_->Reshape(2, 10, 4, 5); - vector label_shape(3); - label_shape[0] = 2; label_shape[1] = 4; label_shape[2] = 5; - this->blob_bottom_label_->Reshape(label_shape); - this->FillBottoms(); - LayerParameter layer_param; - layer_param.mutable_accuracy_param()->set_axis(1); - AccuracyLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - TypeParam max_value; - const int num_labels = this->blob_bottom_label_->count(); - int max_id; - int num_correct_labels = 0; - vector label_offset(3); - for (int n = 0; n < this->blob_bottom_data_->num(); ++n) { - for (int h = 0; h < this->blob_bottom_data_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_data_->width(); ++w) { - max_value = -FLT_MAX; - max_id = 0; - for (int c = 0; c < this->blob_bottom_data_->channels(); ++c) { - const TypeParam pred_value = - this->blob_bottom_data_->data_at(n, c, h, w); - if (pred_value > max_value) { - max_value = pred_value; - max_id = c; - } - } - label_offset[0] = n; label_offset[1] = h; label_offset[2] = w; - const int correct_label = - static_cast(this->blob_bottom_label_->data_at(label_offset)); - if (max_id == correct_label) { - ++num_correct_labels; - } - } - } - } - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / TypeParam(num_labels), 1e-4); -} - -TYPED_TEST(AccuracyLayerTest, TestForwardIgnoreLabel) { - LayerParameter layer_param; - const TypeParam kIgnoreLabelValue = -1; - layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue); - AccuracyLayer layer(layer_param); - // Manually set some labels to the ignore label value (-1). 
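- // Three of the 100 labels are overwritten below, so the layer should score
- // only the remaining 97 samples; the reference accuracy computed in this
- // test skips the ignored indices the same way.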
- this->blob_bottom_label_->mutable_cpu_data()[2] = kIgnoreLabelValue; - this->blob_bottom_label_->mutable_cpu_data()[5] = kIgnoreLabelValue; - this->blob_bottom_label_->mutable_cpu_data()[32] = kIgnoreLabelValue; - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - TypeParam max_value; - int max_id; - int num_correct_labels = 0; - int count = 0; - for (int i = 0; i < 100; ++i) { - if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - continue; - } - ++count; - max_value = -FLT_MAX; - max_id = 0; - for (int j = 0; j < 10; ++j) { - if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { - max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - max_id = j; - } - } - if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; - } - } - EXPECT_EQ(count, 97); // We set 3 out of 100 labels to kIgnoreLabelValue. - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / TypeParam(count), 1e-4); -} - -TYPED_TEST(AccuracyLayerTest, TestForwardCPUTopK) { - LayerParameter layer_param; - AccuracyParameter* accuracy_param = layer_param.mutable_accuracy_param(); - accuracy_param->set_top_k(this->top_k_); - AccuracyLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - TypeParam current_value; - int current_rank; - int num_correct_labels = 0; - for (int i = 0; i < 100; ++i) { - for (int j = 0; j < 10; ++j) { - current_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - current_rank = 0; - for (int k = 0; k < 10; ++k) { - if (this->blob_bottom_data_->data_at(i, k, 0, 0) > current_value) { - ++current_rank; - } - } - if (current_rank < this->top_k_ && - j == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; - } - } - } - - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / 100.0, 1e-4); -} - -TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClass) { - LayerParameter layer_param; - AccuracyLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_); - - TypeParam max_value; - int max_id; - int num_correct_labels = 0; - const int num_class = this->blob_top_per_class_->num(); - vector correct_per_class(num_class, 0); - vector num_per_class(num_class, 0); - for (int i = 0; i < 100; ++i) { - max_value = -FLT_MAX; - max_id = 0; - for (int j = 0; j < 10; ++j) { - if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { - max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - max_id = j; - } - } - ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)]; - if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; - ++correct_per_class[max_id]; - } - } - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / 100.0, 1e-4); - for (int i = 0; i < num_class; ++i) { - TypeParam accuracy_per_class = (num_per_class[i] > 0 ? 
- static_cast(correct_per_class[i]) / num_per_class[i] : 0); - EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0), - accuracy_per_class, 1e-4); - } -} - - -TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClassWithIgnoreLabel) { - LayerParameter layer_param; - const TypeParam kIgnoreLabelValue = -1; - layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue); - AccuracyLayer layer(layer_param); - // Manually set some labels to the ignore label value (-1). - this->blob_bottom_label_->mutable_cpu_data()[2] = kIgnoreLabelValue; - this->blob_bottom_label_->mutable_cpu_data()[5] = kIgnoreLabelValue; - this->blob_bottom_label_->mutable_cpu_data()[32] = kIgnoreLabelValue; - layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_); - - TypeParam max_value; - int max_id; - int num_correct_labels = 0; - const int num_class = this->blob_top_per_class_->num(); - vector correct_per_class(num_class, 0); - vector num_per_class(num_class, 0); - int count = 0; - for (int i = 0; i < 100; ++i) { - if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - continue; - } - ++count; - max_value = -FLT_MAX; - max_id = 0; - for (int j = 0; j < 10; ++j) { - if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { - max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - max_id = j; - } - } - ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)]; - if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; - ++correct_per_class[max_id]; - } - } - EXPECT_EQ(count, 97); - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / TypeParam(count), 1e-4); - for (int i = 0; i < 10; ++i) { - TypeParam accuracy_per_class = (num_per_class[i] > 0 ? 
- static_cast(correct_per_class[i]) / num_per_class[i] : 0); - EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0), - accuracy_per_class, 1e-4); - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_argmax_layer.cpp b/src/caffe/test/test_argmax_layer.cpp deleted file mode 100644 index 472e665..0000000 --- a/src/caffe/test/test_argmax_layer.cpp +++ /dev/null @@ -1,295 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/argmax_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class ArgMaxLayerTest : public CPUDeviceTest { - protected: - ArgMaxLayerTest() - : blob_bottom_(new Blob(10, 10, 20, 20)), - blob_top_(new Blob()), - top_k_(5) { - Caffe::set_random_seed(1701); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~ArgMaxLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - size_t top_k_; -}; - -TYPED_TEST_CASE(ArgMaxLayerTest, TestDtypes); - -TYPED_TEST(ArgMaxLayerTest, TestSetup) { - LayerParameter layer_param; - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), 1); -} - -TYPED_TEST(ArgMaxLayerTest, TestSetupMaxVal) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_out_max_val(true); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), 2); -} - -TYPED_TEST(ArgMaxLayerTest, TestSetupAxis) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_axis(0); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->shape(0), argmax_param->top_k()); - EXPECT_EQ(this->blob_top_->shape(1), this->blob_bottom_->shape(0)); - EXPECT_EQ(this->blob_top_->shape(2), this->blob_bottom_->shape(2)); - EXPECT_EQ(this->blob_top_->shape(3), this->blob_bottom_->shape(3)); -} - -TYPED_TEST(ArgMaxLayerTest, TestSetupAxisNegativeIndexing) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_axis(-2); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->shape(0), this->blob_bottom_->shape(0)); - EXPECT_EQ(this->blob_top_->shape(1), this->blob_bottom_->shape(1)); - EXPECT_EQ(this->blob_top_->shape(2), argmax_param->top_k()); - EXPECT_EQ(this->blob_top_->shape(3), this->blob_bottom_->shape(3)); -} - -TYPED_TEST(ArgMaxLayerTest, TestSetupAxisMaxVal) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_axis(2); - argmax_param->set_out_max_val(true); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->shape(0), this->blob_bottom_->shape(0)); - EXPECT_EQ(this->blob_top_->shape(1), this->blob_bottom_->shape(1)); - 
EXPECT_EQ(this->blob_top_->shape(2), argmax_param->top_k()); - EXPECT_EQ(this->blob_top_->shape(3), this->blob_bottom_->shape(3)); -} - -TYPED_TEST(ArgMaxLayerTest, TestCPU) { - LayerParameter layer_param; - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - const TypeParam* top_data = this->blob_top_->cpu_data(); - int max_ind; - TypeParam max_val; - int num = this->blob_bottom_->num(); - int dim = this->blob_bottom_->count() / num; - for (int i = 0; i < num; ++i) { - EXPECT_GE(top_data[i], 0); - EXPECT_LE(top_data[i], dim); - max_ind = top_data[i]; - max_val = bottom_data[i * dim + max_ind]; - for (int j = 0; j < dim; ++j) { - EXPECT_LE(bottom_data[i * dim + j], max_val); - } - } -} - -TYPED_TEST(ArgMaxLayerTest, TestCPUMaxVal) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_out_max_val(true); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - const TypeParam* top_data = this->blob_top_->cpu_data(); - int max_ind; - TypeParam max_val; - int num = this->blob_bottom_->num(); - int dim = this->blob_bottom_->count() / num; - for (int i = 0; i < num; ++i) { - EXPECT_GE(top_data[i], 0); - EXPECT_LE(top_data[i], dim); - max_ind = top_data[i * 2]; - max_val = top_data[i * 2 + 1]; - EXPECT_EQ(bottom_data[i * dim + max_ind], max_val); - for (int j = 0; j < dim; ++j) { - EXPECT_LE(bottom_data[i * dim + j], max_val); - } - } -} - -TYPED_TEST(ArgMaxLayerTest, TestCPUTopK) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_top_k(this->top_k_); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - int max_ind; - TypeParam max_val; - int num = this->blob_bottom_->num(); - int dim = this->blob_bottom_->count() / num; - for (int i = 0; i < num; ++i) { - EXPECT_GE(this->blob_top_->data_at(i, 0, 0, 0), 0); - EXPECT_LE(this->blob_top_->data_at(i, 0, 0, 0), dim); - for (int j = 0; j < this->top_k_; ++j) { - max_ind = this->blob_top_->data_at(i, 0, j, 0); - max_val = bottom_data[i * dim + max_ind]; - int count = 0; - for (int k = 0; k < dim; ++k) { - if (bottom_data[i * dim + k] > max_val) { - ++count; - } - } - EXPECT_EQ(j, count); - } - } -} - -TYPED_TEST(ArgMaxLayerTest, TestCPUMaxValTopK) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_out_max_val(true); - argmax_param->set_top_k(this->top_k_); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - int max_ind; - TypeParam max_val; - int num = this->blob_bottom_->num(); - int dim = this->blob_bottom_->count() / num; - for (int i = 0; i < num; ++i) { - EXPECT_GE(this->blob_top_->data_at(i, 0, 0, 0), 0); - EXPECT_LE(this->blob_top_->data_at(i, 0, 0, 0), dim); - for (int j = 0; j < 
this->top_k_; ++j) { - max_ind = this->blob_top_->data_at(i, 0, j, 0); - max_val = this->blob_top_->data_at(i, 1, j, 0); - EXPECT_EQ(bottom_data[i * dim + max_ind], max_val); - int count = 0; - for (int k = 0; k < dim; ++k) { - if (bottom_data[i * dim + k] > max_val) { - ++count; - } - } - EXPECT_EQ(j, count); - } - } -} - -TYPED_TEST(ArgMaxLayerTest, TestCPUAxis) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_axis(0); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - int max_ind; - TypeParam max_val; - std::vector shape = this->blob_bottom_->shape(); - for (int i = 0; i < shape[1]; ++i) { - for (int j = 0; j < shape[2]; ++j) { - for (int k = 0; k < shape[3]; ++k) { - max_ind = this->blob_top_->data_at(0, i, j, k); - max_val = this->blob_bottom_->data_at(max_ind, i, j, k); - EXPECT_GE(max_ind, 0); - EXPECT_LE(max_ind, shape[0]); - for (int l = 0; l < shape[0]; ++l) { - EXPECT_LE(this->blob_bottom_->data_at(l, i, j, k), max_val); - } - } - } - } -} - -TYPED_TEST(ArgMaxLayerTest, TestCPUAxisTopK) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_axis(2); - argmax_param->set_top_k(this->top_k_); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - int max_ind; - TypeParam max_val; - std::vector shape = this->blob_bottom_->shape(); - for (int i = 0; i < shape[0]; ++i) { - for (int j = 0; j < shape[1]; ++j) { - for (int k = 0; k < shape[3]; ++k) { - for (int m = 0; m < this->top_k_; ++m) { - max_ind = this->blob_top_->data_at(i, j, m, k); - max_val = this->blob_bottom_->data_at(i, j, max_ind, k); - EXPECT_GE(max_ind, 0); - EXPECT_LE(max_ind, shape[2]); - int count = 0; - for (int l = 0; l < shape[2]; ++l) { - if (this->blob_bottom_->data_at(i, j, l, k) > max_val) { - ++count; - } - } - EXPECT_EQ(m, count); - } - } - } - } -} - -TYPED_TEST(ArgMaxLayerTest, TestCPUAxisMaxValTopK) { - LayerParameter layer_param; - ArgMaxParameter* argmax_param = layer_param.mutable_argmax_param(); - argmax_param->set_axis(-1); - argmax_param->set_top_k(this->top_k_); - argmax_param->set_out_max_val(true); - ArgMaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - TypeParam max_val; - std::vector shape = this->blob_bottom_->shape(); - for (int i = 0; i < shape[0]; ++i) { - for (int j = 0; j < shape[1]; ++j) { - for (int k = 0; k < shape[2]; ++k) { - for (int m = 0; m < this->top_k_; ++m) { - max_val = this->blob_top_->data_at(i, j, k, m); - int count = 0; - for (int l = 0; l < shape[3]; ++l) { - if (this->blob_bottom_->data_at(i, j, k, l) > max_val) { - ++count; - } - } - EXPECT_EQ(m, count); - } - } - } - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_batch_norm_layer.cpp b/src/caffe/test/test_batch_norm_layer.cpp deleted file mode 100644 index 936b93a..0000000 --- a/src/caffe/test/test_batch_norm_layer.cpp +++ /dev/null @@ -1,133 +0,0 @@ -#include -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/batch_norm_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include 
"caffe/test/test_gradient_check_util.hpp" - -#define BATCH_SIZE 2 -#define INPUT_DATA_SIZE 3 - -namespace caffe { - - template - class BatchNormLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - protected: - BatchNormLayerTest() - : blob_bottom_(new Blob(5, 2, 3, 4)), - blob_top_(new Blob()) { - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~BatchNormLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - }; - - TYPED_TEST_CASE(BatchNormLayerTest, TestDtypesAndDevices); - - TYPED_TEST(BatchNormLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - - BatchNormLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - // Test mean - int num = this->blob_bottom_->num(); - int channels = this->blob_bottom_->channels(); - int height = this->blob_bottom_->height(); - int width = this->blob_bottom_->width(); - - for (int j = 0; j < channels; ++j) { - Dtype sum = 0, var = 0; - for (int i = 0; i < num; ++i) { - for ( int k = 0; k < height; ++k ) { - for ( int l = 0; l < width; ++l ) { - Dtype data = this->blob_top_->data_at(i, j, k, l); - sum += data; - var += data * data; - } - } - } - sum /= height * width * num; - var /= height * width * num; - - const Dtype kErrorBound = 0.001; - // expect zero mean - EXPECT_NEAR(0, sum, kErrorBound); - // expect unit variance - EXPECT_NEAR(1, var, kErrorBound); - } - } - - TYPED_TEST(BatchNormLayerTest, TestForwardInplace) { - typedef typename TypeParam::Dtype Dtype; - Blob blob_inplace(5, 2, 3, 4); - vector*> blob_bottom_vec; - vector*> blob_top_vec; - LayerParameter layer_param; - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(&blob_inplace); - blob_bottom_vec.push_back(&blob_inplace); - blob_top_vec.push_back(&blob_inplace); - - BatchNormLayer layer(layer_param); - layer.SetUp(blob_bottom_vec, blob_top_vec); - layer.Forward(blob_bottom_vec, blob_top_vec); - - // Test mean - int num = blob_inplace.num(); - int channels = blob_inplace.channels(); - int height = blob_inplace.height(); - int width = blob_inplace.width(); - - for (int j = 0; j < channels; ++j) { - Dtype sum = 0, var = 0; - for (int i = 0; i < num; ++i) { - for ( int k = 0; k < height; ++k ) { - for ( int l = 0; l < width; ++l ) { - Dtype data = blob_inplace.data_at(i, j, k, l); - sum += data; - var += data * data; - } - } - } - sum /= height * width * num; - var /= height * width * num; - - const Dtype kErrorBound = 0.001; - // expect zero mean - EXPECT_NEAR(0, sum, kErrorBound); - // expect unit variance - EXPECT_NEAR(1, var, kErrorBound); - } - } - - TYPED_TEST(BatchNormLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - - BatchNormLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-4); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - -} // namespace caffe diff --git a/src/caffe/test/test_batch_reindex_layer.cpp b/src/caffe/test/test_batch_reindex_layer.cpp deleted file mode 100644 index 9ea1a2f..0000000 --- a/src/caffe/test/test_batch_reindex_layer.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include - -#include 
"gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/batch_reindex_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class BatchReindexLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - BatchReindexLayerTest() - : blob_bottom_(new Blob()), - blob_bottom_permute_(new Blob()), - blob_top_(new Blob()) { - } - virtual void SetUp() { - Caffe::set_random_seed(1701); - vector sz; - sz.push_back(5); - sz.push_back(4); - sz.push_back(3); - sz.push_back(2); - blob_bottom_->Reshape(sz); - vector permsz; - permsz.push_back(6); - blob_bottom_permute_->Reshape(permsz); - - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - int perm[] = { 4, 0, 4, 0, 1, 2 }; - for (int i = 0; i < blob_bottom_permute_->count(); ++i) { - blob_bottom_permute_->mutable_cpu_data()[i] = perm[i]; - } - - blob_bottom_vec_.push_back(blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_permute_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~BatchReindexLayerTest() { - delete blob_bottom_permute_; - delete blob_bottom_; - delete blob_top_; - } - Blob* const blob_bottom_; - Blob* const blob_bottom_permute_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - - void TestForward() { - LayerParameter layer_param; - - vector sz; - sz.push_back(5); - sz.push_back(4); - sz.push_back(3); - sz.push_back(2); - blob_bottom_->Reshape(sz); - for (int i = 0; i < blob_bottom_->count(); ++i) { - blob_bottom_->mutable_cpu_data()[i] = i; - } - - vector permsz; - permsz.push_back(6); - blob_bottom_permute_->Reshape(permsz); - int perm[] = { 4, 0, 4, 0, 1, 2 }; - for (int i = 0; i < blob_bottom_permute_->count(); ++i) { - blob_bottom_permute_->mutable_cpu_data()[i] = perm[i]; - } - BatchReindexLayer layer(layer_param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_->num(), blob_bottom_permute_->num()); - EXPECT_EQ(blob_top_->channels(), blob_bottom_->channels()); - EXPECT_EQ(blob_top_->height(), blob_bottom_->height()); - EXPECT_EQ(blob_top_->width(), blob_bottom_->width()); - - layer.Forward(blob_bottom_vec_, blob_top_vec_); - int channels = blob_top_->channels(); - int height = blob_top_->height(); - int width = blob_top_->width(); - for (int i = 0; i < blob_top_->count(); ++i) { - int n = i / (channels * width * height); - int inner_idx = (i % (channels * width * height)); - EXPECT_EQ( - blob_top_->cpu_data()[i], - blob_bottom_->cpu_data()[perm[n] * channels * width * height - + inner_idx]); - } - } -}; - -TYPED_TEST_CASE(BatchReindexLayerTest, TestDtypesAndDevices); - -TYPED_TEST(BatchReindexLayerTest, TestForward) { - this->TestForward(); -} - -TYPED_TEST(BatchReindexLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BatchReindexLayer layer(layer_param); - GradientChecker checker(1e-4, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); - } - -} // namespace caffe diff --git a/src/caffe/test/test_benchmark.cpp b/src/caffe/test/test_benchmark.cpp deleted file mode 100644 index b03fdf6..0000000 --- a/src/caffe/test/test_benchmark.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/util/benchmark.hpp" - -#include 
"caffe/test/test_caffe_main.hpp" - -namespace caffe { - -const float kMillisecondsThreshold = 30; - -template -class BenchmarkTest : public MultiDeviceTest {}; - -TYPED_TEST_CASE(BenchmarkTest, TestDtypesAndDevices); - -TYPED_TEST(BenchmarkTest, TestTimerConstructor) { - Timer timer; - EXPECT_TRUE(timer.initted()); - EXPECT_FALSE(timer.running()); - EXPECT_FALSE(timer.has_run_at_least_once()); -} - -TYPED_TEST(BenchmarkTest, TestTimerStart) { - Timer timer; - timer.Start(); - EXPECT_TRUE(timer.initted()); - EXPECT_TRUE(timer.running()); - EXPECT_TRUE(timer.has_run_at_least_once()); - timer.Start(); - EXPECT_TRUE(timer.initted()); - EXPECT_TRUE(timer.running()); - EXPECT_TRUE(timer.has_run_at_least_once()); - timer.Stop(); - timer.Start(); - EXPECT_TRUE(timer.initted()); - EXPECT_TRUE(timer.running()); - EXPECT_TRUE(timer.has_run_at_least_once()); -} - -TYPED_TEST(BenchmarkTest, TestTimerStop) { - Timer timer; - timer.Stop(); - EXPECT_TRUE(timer.initted()); - EXPECT_FALSE(timer.running()); - EXPECT_FALSE(timer.has_run_at_least_once()); - timer.Start(); - timer.Stop(); - EXPECT_TRUE(timer.initted()); - EXPECT_FALSE(timer.running()); - EXPECT_TRUE(timer.has_run_at_least_once()); - timer.Stop(); - EXPECT_TRUE(timer.initted()); - EXPECT_FALSE(timer.running()); - EXPECT_TRUE(timer.has_run_at_least_once()); -} - -TYPED_TEST(BenchmarkTest, TestTimerMilliSeconds) { - Timer timer; - EXPECT_EQ(timer.MilliSeconds(), 0); - EXPECT_TRUE(timer.initted()); - EXPECT_FALSE(timer.running()); - EXPECT_FALSE(timer.has_run_at_least_once()); - timer.Start(); - boost::this_thread::sleep(boost::posix_time::milliseconds(300)); - EXPECT_GE(timer.MilliSeconds(), 300 - kMillisecondsThreshold); - EXPECT_LE(timer.MilliSeconds(), 300 + kMillisecondsThreshold); - EXPECT_TRUE(timer.initted()); - EXPECT_FALSE(timer.running()); - EXPECT_TRUE(timer.has_run_at_least_once()); -} - -TYPED_TEST(BenchmarkTest, TestTimerSeconds) { - Timer timer; - EXPECT_EQ(timer.Seconds(), 0); - EXPECT_TRUE(timer.initted()); - EXPECT_FALSE(timer.running()); - EXPECT_FALSE(timer.has_run_at_least_once()); - timer.Start(); - boost::this_thread::sleep(boost::posix_time::milliseconds(300)); - EXPECT_GE(timer.Seconds(), 0.3 - kMillisecondsThreshold / 1000.); - EXPECT_LE(timer.Seconds(), 0.3 + kMillisecondsThreshold / 1000.); - EXPECT_TRUE(timer.initted()); - EXPECT_FALSE(timer.running()); - EXPECT_TRUE(timer.has_run_at_least_once()); -} - -} // namespace caffe diff --git a/src/caffe/test/test_bias_layer.cpp b/src/caffe/test/test_bias_layer.cpp deleted file mode 100644 index 3862e76..0000000 --- a/src/caffe/test/test_bias_layer.cpp +++ /dev/null @@ -1,467 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/bias_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class BiasLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - BiasLayerTest() - : blob_bottom_(new Blob(2, 3, 4, 5)), - blob_bottom_eltwise_(new Blob(2, 3, 4, 5)), - blob_bottom_broadcast_0_(new Blob()), - blob_bottom_broadcast_1_(new Blob()), - blob_bottom_broadcast_2_(new Blob()), - blob_bottom_bias_(new Blob(vector())), - blob_top_(new Blob()) { - Caffe::set_random_seed(1701); - vector broadcast_shape(2); - broadcast_shape[0] = 2; broadcast_shape[1] = 3; - this->blob_bottom_broadcast_0_->Reshape(broadcast_shape); - broadcast_shape[0] = 3; 
broadcast_shape[1] = 4; - this->blob_bottom_broadcast_1_->Reshape(broadcast_shape); - broadcast_shape[0] = 4; broadcast_shape[1] = 5; - this->blob_bottom_broadcast_2_->Reshape(broadcast_shape); - FillerParameter filler_param; - filler_param.set_min(1); - filler_param.set_max(10); - UniformFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - filler.Fill(this->blob_bottom_eltwise_); - filler.Fill(this->blob_bottom_broadcast_0_); - filler.Fill(this->blob_bottom_broadcast_1_); - filler.Fill(this->blob_bottom_broadcast_2_); - filler.Fill(this->blob_bottom_bias_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~BiasLayerTest() { - delete blob_bottom_; - delete blob_bottom_eltwise_; - delete blob_bottom_broadcast_0_; - delete blob_bottom_broadcast_1_; - delete blob_bottom_broadcast_2_; - delete blob_bottom_bias_; - delete blob_top_; - } - Blob* const blob_bottom_; - Blob* const blob_bottom_eltwise_; - Blob* const blob_bottom_broadcast_0_; - Blob* const blob_bottom_broadcast_1_; - Blob* const blob_bottom_broadcast_2_; - Blob* const blob_bottom_bias_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(BiasLayerTest, TestDtypesAndDevices); - -TYPED_TEST(BiasLayerTest, TestForwardEltwise) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(0); - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data_a = this->blob_bottom_->cpu_data(); - const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i], 1e-5); - } -} - -TYPED_TEST(BiasLayerTest, TestForwardEltwiseInPlace) { - typedef typename TypeParam::Dtype Dtype; - this->blob_top_vec_[0] = this->blob_bottom_; // in-place computation - Blob orig_bottom(this->blob_bottom_->shape()); - orig_bottom.CopyFrom(*this->blob_bottom_); - this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(0); - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_bottom_->cpu_data(); - const int count = this->blob_bottom_->count(); - const Dtype* in_data_a = orig_bottom.cpu_data(); - const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i], 1e-5); - } -} - -TYPED_TEST(BiasLayerTest, TestBackwardEltwiseInPlace) { - typedef typename TypeParam::Dtype Dtype; - Blob orig_bottom(this->blob_bottom_->shape()); - orig_bottom.CopyFrom(*this->blob_bottom_); - this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(0); - shared_ptr > layer(new BiasLayer(layer_param)); - Blob top_diff(this->blob_bottom_->shape()); - FillerParameter filler_param; - filler_param.set_type("gaussian"); - filler_param.set_std(1); - GaussianFiller filler(filler_param); - 
filler.Fill(&top_diff); - vector propagate_down(2, true); - // Run forward + backward without in-place computation; - // save resulting bottom diffs. - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - const bool kReshape = true; - const bool kCopyDiff = true; - Blob orig_bottom_diff; - orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape); - Blob orig_bias_diff; - orig_bias_diff.CopyFrom(*this->blob_bottom_eltwise_, - kCopyDiff, kReshape); - // Rerun forward + backward with in-place computation; - // check that resulting bottom diffs are the same. - this->blob_top_vec_[0] = this->blob_bottom_; // in-place computation - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_bottom_->mutable_cpu_diff()); - layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i], - this->blob_bottom_->cpu_diff()[i], 1e-5); - } - for (int i = 0; i < this->blob_bottom_eltwise_->count(); ++i) { - EXPECT_NEAR(orig_bias_diff.cpu_diff()[i], - this->blob_bottom_eltwise_->cpu_diff()[i], 1e-5); - } -} - -TYPED_TEST(BiasLayerTest, TestForwardEltwiseWithParam) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BiasParameter* bias_param = layer_param.mutable_bias_param(); - bias_param->set_axis(0); - bias_param->set_num_axes(-1); - bias_param->mutable_filler()->set_type("gaussian"); - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data_a = this->blob_bottom_->cpu_data(); - const Dtype* in_data_b = layer->blobs()[0]->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i], 1e-5); - } -} - -TYPED_TEST(BiasLayerTest, TestForwardBroadcastBegin) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(0); - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_->data_at(n, c, h, w) + - this->blob_bottom_broadcast_0_->data_at(n, c, 0, 0), - 1e-5); - } - } - } - } -} - -TYPED_TEST(BiasLayerTest, TestForwardBroadcastMiddle) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(1); - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, 
this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_->data_at(n, c, h, w) + - this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0), - 1e-5); - } - } - } - } -} - -TYPED_TEST(BiasLayerTest, TestForwardBroadcastMiddleInPlace) { - typedef typename TypeParam::Dtype Dtype; - this->blob_top_vec_[0] = this->blob_bottom_; // in-place computation - Blob orig_bottom(this->blob_bottom_->shape()); - orig_bottom.CopyFrom(*this->blob_bottom_); - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(1); - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_bottom_->data_at(n, c, h, w), - orig_bottom.data_at(n, c, h, w) + - this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0), - 1e-5); - } - } - } - } -} - -TYPED_TEST(BiasLayerTest, TestBackwardBroadcastMiddleInPlace) { - typedef typename TypeParam::Dtype Dtype; - Blob orig_bottom(this->blob_bottom_->shape()); - orig_bottom.CopyFrom(*this->blob_bottom_); - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(1); - shared_ptr > layer(new BiasLayer(layer_param)); - Blob top_diff(this->blob_bottom_->shape()); - FillerParameter filler_param; - filler_param.set_type("gaussian"); - filler_param.set_std(1); - GaussianFiller filler(filler_param); - filler.Fill(&top_diff); - vector propagate_down(2, true); - // Run forward + backward without in-place computation; - // save resulting bottom diffs. - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - const bool kReshape = true; - const bool kCopyDiff = true; - Blob orig_bottom_diff; - orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape); - Blob orig_bias_diff; - orig_bias_diff.CopyFrom(*this->blob_bottom_broadcast_1_, - kCopyDiff, kReshape); - // Rerun forward + backward with in-place computation; - // check that resulting bottom diffs are the same. 
- this->blob_top_vec_[0] = this->blob_bottom_; // in-place computation - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_bottom_->mutable_cpu_diff()); - layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i], - this->blob_bottom_->cpu_diff()[i], 1e-5); - } - for (int i = 0; i < this->blob_bottom_broadcast_1_->count(); ++i) { - EXPECT_NEAR(orig_bias_diff.cpu_diff()[i], - this->blob_bottom_broadcast_1_->cpu_diff()[i], 1e-5); - } -} - -TYPED_TEST(BiasLayerTest, TestForwardBroadcastMiddleWithParam) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BiasParameter* bias_param = layer_param.mutable_bias_param(); - bias_param->set_axis(1); - bias_param->set_num_axes(2); - bias_param->mutable_filler()->set_type("gaussian"); - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_->data_at(n, c, h, w) + - layer->blobs()[0]->data_at(c, h, 0, 0), 1e-5); - } - } - } - } -} - -TYPED_TEST(BiasLayerTest, TestForwardBroadcastEnd) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(2); - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_->data_at(n, c, h, w) + - this->blob_bottom_broadcast_2_->data_at(h, w, 0, 0), - 1e-5); - } - } - } - } -} - -TYPED_TEST(BiasLayerTest, TestForwardBias) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_bias_); - LayerParameter layer_param; - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data = this->blob_bottom_->cpu_data(); - const Dtype bias = *this->blob_bottom_bias_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data[i] + bias, 1e-5); - } -} - -TYPED_TEST(BiasLayerTest, TestForwardBiasAxis2) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_bias_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(2); - shared_ptr > layer(new BiasLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, 
this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data = this->blob_bottom_->cpu_data(); - const Dtype bias = *this->blob_bottom_bias_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data[i] + bias, 1e-5); - } -} - -TYPED_TEST(BiasLayerTest, TestGradientEltwise) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(0); - BiasLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(BiasLayerTest, TestGradientEltwiseWithParam) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BiasParameter* bias_param = layer_param.mutable_bias_param(); - bias_param->set_axis(0); - bias_param->set_num_axes(-1); - bias_param->mutable_filler()->set_type("gaussian"); - BiasLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(BiasLayerTest, TestGradientBroadcastBegin) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(0); - BiasLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(BiasLayerTest, TestGradientBroadcastMiddle) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(1); - BiasLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(BiasLayerTest, TestGradientBroadcastMiddleWithParam) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - BiasParameter* bias_param = layer_param.mutable_bias_param(); - bias_param->set_axis(1); - bias_param->set_num_axes(2); - bias_param->mutable_filler()->set_type("gaussian"); - BiasLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(BiasLayerTest, TestGradientBroadcastEnd) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(2); - BiasLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(BiasLayerTest, TestGradientBias) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_bias_); - LayerParameter layer_param; - BiasLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(BiasLayerTest, TestGradientBiasAxis2) { - typedef typename 
TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_bias_); - LayerParameter layer_param; - layer_param.mutable_bias_param()->set_axis(2); - BiasLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_blob.cpp b/src/caffe/test/test_blob.cpp deleted file mode 100644 index b885622..0000000 --- a/src/caffe/test/test_blob.cpp +++ /dev/null @@ -1,301 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class BlobSimpleTest : public ::testing::Test { - protected: - BlobSimpleTest() - : blob_(new Blob()), - blob_preshaped_(new Blob(2, 3, 4, 5)) {} - virtual ~BlobSimpleTest() { delete blob_; delete blob_preshaped_; } - Blob* const blob_; - Blob* const blob_preshaped_; -}; - -TYPED_TEST_CASE(BlobSimpleTest, TestDtypes); - -TYPED_TEST(BlobSimpleTest, TestInitialization) { - EXPECT_TRUE(this->blob_); - EXPECT_TRUE(this->blob_preshaped_); - EXPECT_EQ(this->blob_preshaped_->num(), 2); - EXPECT_EQ(this->blob_preshaped_->channels(), 3); - EXPECT_EQ(this->blob_preshaped_->height(), 4); - EXPECT_EQ(this->blob_preshaped_->width(), 5); - EXPECT_EQ(this->blob_preshaped_->count(), 120); - EXPECT_EQ(this->blob_->num_axes(), 0); - EXPECT_EQ(this->blob_->count(), 0); -} - -TYPED_TEST(BlobSimpleTest, TestPointersCPUGPU) { - EXPECT_TRUE(this->blob_preshaped_->gpu_data()); - EXPECT_TRUE(this->blob_preshaped_->cpu_data()); - EXPECT_TRUE(this->blob_preshaped_->mutable_gpu_data()); - EXPECT_TRUE(this->blob_preshaped_->mutable_cpu_data()); -} - -TYPED_TEST(BlobSimpleTest, TestReshape) { - this->blob_->Reshape(2, 3, 4, 5); - EXPECT_EQ(this->blob_->num(), 2); - EXPECT_EQ(this->blob_->channels(), 3); - EXPECT_EQ(this->blob_->height(), 4); - EXPECT_EQ(this->blob_->width(), 5); - EXPECT_EQ(this->blob_->count(), 120); -} - -TYPED_TEST(BlobSimpleTest, TestReshapeZero) { - vector shape(2); - shape[0] = 0; - shape[1] = 5; - this->blob_->Reshape(shape); - EXPECT_EQ(this->blob_->count(), 0); -} - -TYPED_TEST(BlobSimpleTest, TestLegacyBlobProtoShapeEquals) { - BlobProto blob_proto; - - // Reshape to (3 x 2). - vector shape(2); - shape[0] = 3; - shape[1] = 2; - this->blob_->Reshape(shape); - - // (3 x 2) blob == (1 x 1 x 3 x 2) legacy blob - blob_proto.set_num(1); - blob_proto.set_channels(1); - blob_proto.set_height(3); - blob_proto.set_width(2); - EXPECT_TRUE(this->blob_->ShapeEquals(blob_proto)); - - // (3 x 2) blob != (0 x 1 x 3 x 2) legacy blob - blob_proto.set_num(0); - blob_proto.set_channels(1); - blob_proto.set_height(3); - blob_proto.set_width(2); - EXPECT_FALSE(this->blob_->ShapeEquals(blob_proto)); - - // (3 x 2) blob != (3 x 1 x 3 x 2) legacy blob - blob_proto.set_num(3); - blob_proto.set_channels(1); - blob_proto.set_height(3); - blob_proto.set_width(2); - EXPECT_FALSE(this->blob_->ShapeEquals(blob_proto)); - - // Reshape to (1 x 3 x 2). - shape.insert(shape.begin(), 1); - this->blob_->Reshape(shape); - - // (1 x 3 x 2) blob == (1 x 1 x 3 x 2) legacy blob - blob_proto.set_num(1); - blob_proto.set_channels(1); - blob_proto.set_height(3); - blob_proto.set_width(2); - EXPECT_TRUE(this->blob_->ShapeEquals(blob_proto)); - - // Reshape to (2 x 3 x 2). 
- shape[0] = 2; - this->blob_->Reshape(shape); - - // (2 x 3 x 2) blob != (1 x 1 x 3 x 2) legacy blob - blob_proto.set_num(1); - blob_proto.set_channels(1); - blob_proto.set_height(3); - blob_proto.set_width(2); - EXPECT_FALSE(this->blob_->ShapeEquals(blob_proto)); -} - -template -class BlobMathTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - protected: - BlobMathTest() - : blob_(new Blob(2, 3, 4, 5)), - epsilon_(1e-6) {} - - virtual ~BlobMathTest() { delete blob_; } - Blob* const blob_; - Dtype epsilon_; -}; - -TYPED_TEST_CASE(BlobMathTest, TestDtypesAndDevices); - -TYPED_TEST(BlobMathTest, TestSumOfSquares) { - typedef typename TypeParam::Dtype Dtype; - - // Uninitialized Blob should have sum of squares == 0. - EXPECT_EQ(0, this->blob_->sumsq_data()); - EXPECT_EQ(0, this->blob_->sumsq_diff()); - FillerParameter filler_param; - filler_param.set_min(-3); - filler_param.set_max(3); - UniformFiller filler(filler_param); - filler.Fill(this->blob_); - Dtype expected_sumsq = 0; - const Dtype* data = this->blob_->cpu_data(); - for (int i = 0; i < this->blob_->count(); ++i) { - expected_sumsq += data[i] * data[i]; - } - // Do a mutable access on the current device, - // so that the sumsq computation is done on that device. - // (Otherwise, this would only check the CPU sumsq implementation.) - switch (TypeParam::device) { - case Caffe::CPU: - this->blob_->mutable_cpu_data(); - break; - case Caffe::GPU: - this->blob_->mutable_gpu_data(); - break; - default: - LOG(FATAL) << "Unknown device: " << TypeParam::device; - } - EXPECT_NEAR(expected_sumsq, this->blob_->sumsq_data(), - this->epsilon_ * expected_sumsq); - EXPECT_EQ(0, this->blob_->sumsq_diff()); - - // Check sumsq_diff too. - const Dtype kDiffScaleFactor = 7; - caffe_cpu_scale(this->blob_->count(), kDiffScaleFactor, data, - this->blob_->mutable_cpu_diff()); - switch (TypeParam::device) { - case Caffe::CPU: - this->blob_->mutable_cpu_diff(); - break; - case Caffe::GPU: - this->blob_->mutable_gpu_diff(); - break; - default: - LOG(FATAL) << "Unknown device: " << TypeParam::device; - } - EXPECT_NEAR(expected_sumsq, this->blob_->sumsq_data(), - this->epsilon_ * expected_sumsq); - const Dtype expected_sumsq_diff = - expected_sumsq * kDiffScaleFactor * kDiffScaleFactor; - EXPECT_NEAR(expected_sumsq_diff, this->blob_->sumsq_diff(), - this->epsilon_ * expected_sumsq_diff); -} - -TYPED_TEST(BlobMathTest, TestAsum) { - typedef typename TypeParam::Dtype Dtype; - - // Uninitialized Blob should have asum == 0. - EXPECT_EQ(0, this->blob_->asum_data()); - EXPECT_EQ(0, this->blob_->asum_diff()); - FillerParameter filler_param; - filler_param.set_min(-3); - filler_param.set_max(3); - UniformFiller filler(filler_param); - filler.Fill(this->blob_); - Dtype expected_asum = 0; - const Dtype* data = this->blob_->cpu_data(); - for (int i = 0; i < this->blob_->count(); ++i) { - expected_asum += std::fabs(data[i]); - } - // Do a mutable access on the current device, - // so that the asum computation is done on that device. - // (Otherwise, this would only check the CPU asum implementation.) - switch (TypeParam::device) { - case Caffe::CPU: - this->blob_->mutable_cpu_data(); - break; - case Caffe::GPU: - this->blob_->mutable_gpu_data(); - break; - default: - LOG(FATAL) << "Unknown device: " << TypeParam::device; - } - EXPECT_NEAR(expected_asum, this->blob_->asum_data(), - this->epsilon_ * expected_asum); - EXPECT_EQ(0, this->blob_->asum_diff()); - - // Check asum_diff too. 
- const Dtype kDiffScaleFactor = 7; - caffe_cpu_scale(this->blob_->count(), kDiffScaleFactor, data, - this->blob_->mutable_cpu_diff()); - switch (TypeParam::device) { - case Caffe::CPU: - this->blob_->mutable_cpu_diff(); - break; - case Caffe::GPU: - this->blob_->mutable_gpu_diff(); - break; - default: - LOG(FATAL) << "Unknown device: " << TypeParam::device; - } - EXPECT_NEAR(expected_asum, this->blob_->asum_data(), - this->epsilon_ * expected_asum); - const Dtype expected_diff_asum = expected_asum * kDiffScaleFactor; - EXPECT_NEAR(expected_diff_asum, this->blob_->asum_diff(), - this->epsilon_ * expected_diff_asum); -} - -TYPED_TEST(BlobMathTest, TestScaleData) { - typedef typename TypeParam::Dtype Dtype; - - EXPECT_EQ(0, this->blob_->asum_data()); - EXPECT_EQ(0, this->blob_->asum_diff()); - FillerParameter filler_param; - filler_param.set_min(-3); - filler_param.set_max(3); - UniformFiller filler(filler_param); - filler.Fill(this->blob_); - const Dtype asum_before_scale = this->blob_->asum_data(); - // Do a mutable access on the current device, - // so that the asum computation is done on that device. - // (Otherwise, this would only check the CPU asum implementation.) - switch (TypeParam::device) { - case Caffe::CPU: - this->blob_->mutable_cpu_data(); - break; - case Caffe::GPU: - this->blob_->mutable_gpu_data(); - break; - default: - LOG(FATAL) << "Unknown device: " << TypeParam::device; - } - const Dtype kDataScaleFactor = 3; - this->blob_->scale_data(kDataScaleFactor); - EXPECT_NEAR(asum_before_scale * kDataScaleFactor, this->blob_->asum_data(), - this->epsilon_ * asum_before_scale * kDataScaleFactor); - EXPECT_EQ(0, this->blob_->asum_diff()); - - // Check scale_diff too. - const Dtype kDataToDiffScaleFactor = 7; - const Dtype* data = this->blob_->cpu_data(); - caffe_cpu_scale(this->blob_->count(), kDataToDiffScaleFactor, data, - this->blob_->mutable_cpu_diff()); - const Dtype expected_asum_before_scale = asum_before_scale * kDataScaleFactor; - EXPECT_NEAR(expected_asum_before_scale, this->blob_->asum_data(), - this->epsilon_ * expected_asum_before_scale); - const Dtype expected_diff_asum_before_scale = - asum_before_scale * kDataScaleFactor * kDataToDiffScaleFactor; - EXPECT_NEAR(expected_diff_asum_before_scale, this->blob_->asum_diff(), - this->epsilon_ * expected_diff_asum_before_scale); - switch (TypeParam::device) { - case Caffe::CPU: - this->blob_->mutable_cpu_diff(); - break; - case Caffe::GPU: - this->blob_->mutable_gpu_diff(); - break; - default: - LOG(FATAL) << "Unknown device: " << TypeParam::device; - } - const Dtype kDiffScaleFactor = 3; - this->blob_->scale_diff(kDiffScaleFactor); - EXPECT_NEAR(asum_before_scale * kDataScaleFactor, this->blob_->asum_data(), - this->epsilon_ * asum_before_scale * kDataScaleFactor); - const Dtype expected_diff_asum = - expected_diff_asum_before_scale * kDiffScaleFactor; - EXPECT_NEAR(expected_diff_asum, this->blob_->asum_diff(), - this->epsilon_ * expected_diff_asum); -} - -} // namespace caffe diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp deleted file mode 100644 index 6473b74..0000000 --- a/src/caffe/test/test_caffe_main.cpp +++ /dev/null @@ -1,38 +0,0 @@ -#include "caffe/caffe.hpp" -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { -#ifndef CPU_ONLY - cudaDeviceProp CAFFE_TEST_CUDA_PROP; -#endif -} - -#ifndef CPU_ONLY -using caffe::CAFFE_TEST_CUDA_PROP; -#endif - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - caffe::GlobalInit(&argc, &argv); -#ifndef 
CPU_ONLY - // Before starting testing, let's first print out a few cuda defice info. - int device; - cudaGetDeviceCount(&device); - cout << "Cuda number of devices: " << device << endl; - if (argc > 1) { - // Use the given device - device = atoi(argv[1]); - cudaSetDevice(device); - cout << "Setting to use device " << device << endl; - } else if (CUDA_TEST_DEVICE >= 0) { - // Use the device assigned in build configuration; but with a lower priority - device = CUDA_TEST_DEVICE; - } - cudaGetDevice(&device); - cout << "Current device id: " << device << endl; - cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device); - cout << "Current device name: " << CAFFE_TEST_CUDA_PROP.name << endl; -#endif - // invoke the test. - return RUN_ALL_TESTS(); -} diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp deleted file mode 100644 index 58ae5c6..0000000 --- a/src/caffe/test/test_common.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/syncedmem.hpp" -#include "caffe/util/math_functions.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -class CommonTest : public ::testing::Test {}; - -#ifndef CPU_ONLY // GPU Caffe singleton test. - -TEST_F(CommonTest, TestCublasHandlerGPU) { - int cuda_device_id; - CUDA_CHECK(cudaGetDevice(&cuda_device_id)); - EXPECT_TRUE(Caffe::cublas_handle()); -} - -#endif - -TEST_F(CommonTest, TestBrewMode) { - Caffe::set_mode(Caffe::CPU); - EXPECT_EQ(Caffe::mode(), Caffe::CPU); - Caffe::set_mode(Caffe::GPU); - EXPECT_EQ(Caffe::mode(), Caffe::GPU); -} - -TEST_F(CommonTest, TestRandSeedCPU) { - SyncedMemory data_a(10 * sizeof(int)); - SyncedMemory data_b(10 * sizeof(int)); - Caffe::set_random_seed(1701); - caffe_rng_bernoulli(10, 0.5, static_cast(data_a.mutable_cpu_data())); - - Caffe::set_random_seed(1701); - caffe_rng_bernoulli(10, 0.5, static_cast(data_b.mutable_cpu_data())); - - for (int i = 0; i < 10; ++i) { - EXPECT_EQ(static_cast(data_a.cpu_data())[i], - static_cast(data_b.cpu_data())[i]); - } -} - -#ifndef CPU_ONLY // GPU Caffe singleton test. 
- -TEST_F(CommonTest, TestRandSeedGPU) { - SyncedMemory data_a(10 * sizeof(unsigned int)); - SyncedMemory data_b(10 * sizeof(unsigned int)); - Caffe::set_random_seed(1701); - CURAND_CHECK(curandGenerate(Caffe::curand_generator(), - static_cast(data_a.mutable_gpu_data()), 10)); - Caffe::set_random_seed(1701); - CURAND_CHECK(curandGenerate(Caffe::curand_generator(), - static_cast(data_b.mutable_gpu_data()), 10)); - for (int i = 0; i < 10; ++i) { - EXPECT_EQ(((const unsigned int*)(data_a.cpu_data()))[i], - ((const unsigned int*)(data_b.cpu_data()))[i]); - } -} - -#endif - -} // namespace caffe diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp deleted file mode 100644 index 23c1e8c..0000000 --- a/src/caffe/test/test_concat_layer.cpp +++ /dev/null @@ -1,207 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/concat_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class ConcatLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - ConcatLayerTest() - : blob_bottom_0_(new Blob(2, 3, 6, 5)), - blob_bottom_1_(new Blob(2, 5, 6, 5)), - blob_bottom_2_(new Blob(5, 3, 6, 5)), - blob_top_(new Blob()) {} - virtual void SetUp() { - // fill the values - shared_ptr > filler; - FillerParameter filler_param; - filler_param.set_value(1.); - filler.reset(new ConstantFiller(filler_param)); - filler->Fill(this->blob_bottom_0_); - filler_param.set_value(2.); - filler.reset(new ConstantFiller(filler_param)); - filler->Fill(this->blob_bottom_1_); - filler_param.set_value(3.); - filler.reset(new ConstantFiller(filler_param)); - filler->Fill(this->blob_bottom_2_); - blob_bottom_vec_0_.push_back(blob_bottom_0_); - blob_bottom_vec_0_.push_back(blob_bottom_1_); - blob_bottom_vec_1_.push_back(blob_bottom_0_); - blob_bottom_vec_1_.push_back(blob_bottom_2_); - blob_top_vec_.push_back(blob_top_); - } - - virtual ~ConcatLayerTest() { - delete blob_bottom_0_; delete blob_bottom_1_; - delete blob_bottom_2_; delete blob_top_; - } - - Blob* const blob_bottom_0_; - Blob* const blob_bottom_1_; - Blob* const blob_bottom_2_; - Blob* const blob_top_; - vector*> blob_bottom_vec_0_, blob_bottom_vec_1_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(ConcatLayerTest, TestDtypesAndDevices); - -TYPED_TEST(ConcatLayerTest, TestSetupNum) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_concat_param()->set_axis(0); - ConcatLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_1_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), - this->blob_bottom_0_->num() + this->blob_bottom_2_->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_0_->channels()); - EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0_->height()); - EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0_->width()); -} - -TYPED_TEST(ConcatLayerTest, TestSetupChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConcatLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_0_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_0_->num()); - EXPECT_EQ(this->blob_top_->channels(), - this->blob_bottom_0_->channels() + this->blob_bottom_1_->channels()); - EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0_->height()); - EXPECT_EQ(this->blob_top_->width(), 
this->blob_bottom_0_->width()); -} - -TYPED_TEST(ConcatLayerTest, TestSetupChannelsNegativeIndexing) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConcatLayer layer(layer_param); - // "channels" index is the third one from the end -- test negative indexing - // by setting axis to -3 and checking that we get the same results as above in - // TestSetupChannels. - layer_param.mutable_concat_param()->set_axis(-3); - layer.SetUp(this->blob_bottom_vec_0_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_0_->num()); - EXPECT_EQ(this->blob_top_->channels(), - this->blob_bottom_0_->channels() + this->blob_bottom_1_->channels()); - EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0_->height()); - EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0_->width()); -} - -TYPED_TEST(ConcatLayerTest, TestForwardTrivial) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConcatLayer layer(layer_param); - this->blob_bottom_vec_0_.resize(1); - layer.SetUp(this->blob_bottom_vec_0_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_0_, this->blob_top_vec_); - for (int i = 0; i < this->blob_bottom_0_->count(); ++i) { - EXPECT_EQ(this->blob_bottom_0_->cpu_data()[i], - this->blob_top_->cpu_data()[i]); - } -} - -TYPED_TEST(ConcatLayerTest, TestForwardNum) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_concat_param()->set_axis(0); - ConcatLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_1_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_1_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_vec_1_[0]->num(); ++n) { - for (int c = 0; c < this->blob_top_->channels(); ++c) { - for (int h = 0; h < this->blob_top_->height(); ++h) { - for (int w = 0; w < this->blob_top_->width(); ++w) { - EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_vec_1_[0]->data_at(n, c, h, w)); - } - } - } - } - for (int n = 0; n < this->blob_bottom_vec_1_[1]->num(); ++n) { - for (int c = 0; c < this->blob_top_->channels(); ++c) { - for (int h = 0; h < this->blob_top_->height(); ++h) { - for (int w = 0; w < this->blob_top_->width(); ++w) { - EXPECT_EQ(this->blob_top_->data_at(n + 2, c, h, w), - this->blob_bottom_vec_1_[1]->data_at(n, c, h, w)); - } - } - } - } -} - -TYPED_TEST(ConcatLayerTest, TestForwardChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConcatLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_0_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_0_, this->blob_top_vec_); - for (int n = 0; n < this->blob_top_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_0_->channels(); ++c) { - for (int h = 0; h < this->blob_top_->height(); ++h) { - for (int w = 0; w < this->blob_top_->width(); ++w) { - EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_vec_0_[0]->data_at(n, c, h, w)); - } - } - } - for (int c = 0; c < this->blob_bottom_1_->channels(); ++c) { - for (int h = 0; h < this->blob_top_->height(); ++h) { - for (int w = 0; w < this->blob_top_->width(); ++w) { - EXPECT_EQ(this->blob_top_->data_at(n, c + 3, h, w), - this->blob_bottom_vec_0_[1]->data_at(n, c, h, w)); - } - } - } - } -} - -TYPED_TEST(ConcatLayerTest, TestGradientTrivial) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConcatLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - this->blob_bottom_vec_0_.resize(1); - 
checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_0_, - this->blob_top_vec_); -} - -TYPED_TEST(ConcatLayerTest, TestGradientNum) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_concat_param()->set_axis(0); - ConcatLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradient(&layer, this->blob_bottom_vec_1_, - this->blob_top_vec_); -} - -TYPED_TEST(ConcatLayerTest, TestGradientChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConcatLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradient(&layer, this->blob_bottom_vec_0_, - this->blob_top_vec_); -} - -TYPED_TEST(ConcatLayerTest, TestGradientChannelsBottomOneOnly) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConcatLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradient(&layer, this->blob_bottom_vec_0_, - this->blob_top_vec_, 1); -} - -} // namespace caffe diff --git a/src/caffe/test/test_contrastive_loss_layer.cpp b/src/caffe/test/test_contrastive_loss_layer.cpp deleted file mode 100644 index 2fa055e..0000000 --- a/src/caffe/test/test_contrastive_loss_layer.cpp +++ /dev/null @@ -1,144 +0,0 @@ -#include -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/contrastive_loss_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class ContrastiveLossLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - ContrastiveLossLayerTest() - : blob_bottom_data_i_(new Blob(512, 2, 1, 1)), - blob_bottom_data_j_(new Blob(512, 2, 1, 1)), - blob_bottom_y_(new Blob(512, 1, 1, 1)), - blob_top_loss_(new Blob()) { - // fill the values - FillerParameter filler_param; - filler_param.set_min(-1.0); - filler_param.set_max(1.0); // distances~=1.0 to test both sides of margin - UniformFiller filler(filler_param); - filler.Fill(this->blob_bottom_data_i_); - blob_bottom_vec_.push_back(blob_bottom_data_i_); - filler.Fill(this->blob_bottom_data_j_); - blob_bottom_vec_.push_back(blob_bottom_data_j_); - for (int i = 0; i < blob_bottom_y_->count(); ++i) { - blob_bottom_y_->mutable_cpu_data()[i] = caffe_rng_rand() % 2; // 0 or 1 - } - blob_bottom_vec_.push_back(blob_bottom_y_); - blob_top_vec_.push_back(blob_top_loss_); - } - virtual ~ContrastiveLossLayerTest() { - delete blob_bottom_data_i_; - delete blob_bottom_data_j_; - delete blob_bottom_y_; - delete blob_top_loss_; - } - - Blob* const blob_bottom_data_i_; - Blob* const blob_bottom_data_j_; - Blob* const blob_bottom_y_; - Blob* const blob_top_loss_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(ContrastiveLossLayerTest, TestDtypesAndDevices); - -TYPED_TEST(ContrastiveLossLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ContrastiveLossLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // manually compute to compare - const Dtype margin = layer_param.contrastive_loss_param().margin(); - const int num = this->blob_bottom_data_i_->num(); - const int channels = this->blob_bottom_data_i_->channels(); - Dtype loss(0); - for (int i = 0; i < num; ++i) { - Dtype dist_sq(0); - for (int j = 0; j < channels; 
++j) { - Dtype diff = this->blob_bottom_data_i_->cpu_data()[i*channels+j] - - this->blob_bottom_data_j_->cpu_data()[i*channels+j]; - dist_sq += diff*diff; - } - if (this->blob_bottom_y_->cpu_data()[i]) { // similar pairs - loss += dist_sq; - } else { - Dtype dist = std::max(margin - sqrt(dist_sq), 0.0); - loss += dist*dist; - } - } - loss /= static_cast(num) * Dtype(2); - EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0], loss, 1e-6); -} - -TYPED_TEST(ContrastiveLossLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ContrastiveLossLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - GradientChecker checker(1e-2, 1e-2, 1701); - // check the gradient for the first two bottom layers - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 1); -} - -TYPED_TEST(ContrastiveLossLayerTest, TestForwardLegacy) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_contrastive_loss_param()->set_legacy_version(true); - ContrastiveLossLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // manually compute to compare - const Dtype margin = layer_param.contrastive_loss_param().margin(); - const int num = this->blob_bottom_data_i_->num(); - const int channels = this->blob_bottom_data_i_->channels(); - Dtype loss(0); - for (int i = 0; i < num; ++i) { - Dtype dist_sq(0); - for (int j = 0; j < channels; ++j) { - Dtype diff = this->blob_bottom_data_i_->cpu_data()[i*channels+j] - - this->blob_bottom_data_j_->cpu_data()[i*channels+j]; - dist_sq += diff*diff; - } - if (this->blob_bottom_y_->cpu_data()[i]) { // similar pairs - loss += dist_sq; - } else { - loss += std::max(margin - dist_sq, Dtype(0.0)); - } - } - loss /= static_cast(num) * Dtype(2); - EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0], loss, 1e-6); -} - -TYPED_TEST(ContrastiveLossLayerTest, TestGradientLegacy) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_contrastive_loss_param()->set_legacy_version(true); - ContrastiveLossLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - GradientChecker checker(1e-2, 1e-2, 1701); - // check the gradient for the first two bottom layers - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 1); -} - -} // namespace caffe diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp deleted file mode 100644 index 9bb19d1..0000000 --- a/src/caffe/test/test_convolution_layer.cpp +++ /dev/null @@ -1,1086 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/conv_layer.hpp" - -#ifdef USE_CUDNN -#include "caffe/layers/cudnn_conv_layer.hpp" -#endif - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -// Reference convolution for checking results: -// accumulate through explicit loops over input, output, and filters. 
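For context on the reference check used throughout these convolution tests: caffe_conv below reads its output extents from the already-reshaped top blob, which presupposes Caffe's usual integer output-size relation. A minimal sketch of that arithmetic, with a hypothetical helper name not taken from the deleted file:

// Hypothetical helper (not part of the original test) showing the output-size
// relation the reference convolution below relies on.
static int conv_out_dim(int in_dim, int kernel, int pad, int stride, int dilation) {
  const int kernel_ext = dilation * (kernel - 1) + 1;   // dilation widens the kernel
  return (in_dim + 2 * pad - kernel_ext) / stride + 1;  // integer (floor) division
}
// Example from TestSetup below: height (6 + 0 - 3) / 2 + 1 = 2,
// width (4 + 0 - 3) / 2 + 1 = 1, matching the EXPECT_EQ checks.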
-template -void caffe_conv(const Blob* in, ConvolutionParameter* conv_param, - const vector > >& weights, - Blob* out) { - const bool has_depth = (out->num_axes() == 5); - if (!has_depth) { CHECK_EQ(4, out->num_axes()); } - // Kernel size, stride, and pad - int kernel_h, kernel_w; - if (conv_param->has_kernel_h() || conv_param->has_kernel_w()) { - kernel_h = conv_param->kernel_h(); - kernel_w = conv_param->kernel_w(); - } else { - kernel_h = kernel_w = conv_param->kernel_size(0); - } - int pad_h, pad_w; - if (conv_param->has_pad_h() || conv_param->has_pad_w()) { - pad_h = conv_param->pad_h(); - pad_w = conv_param->pad_w(); - } else { - pad_h = pad_w = conv_param->pad_size() ? conv_param->pad(0) : 0; - } - int stride_h, stride_w; - if (conv_param->has_stride_h() || conv_param->has_stride_w()) { - stride_h = conv_param->stride_h(); - stride_w = conv_param->stride_w(); - } else { - stride_h = stride_w = conv_param->stride_size() ? conv_param->stride(0) : 1; - } - int dilation_h, dilation_w; - dilation_h = dilation_w = conv_param->dilation_size() ? - conv_param->dilation(0) : 1; - int kernel_d, pad_d, stride_d, dilation_d; - if (has_depth) { - kernel_d = kernel_h; - stride_d = stride_h; - pad_d = pad_h; - dilation_d = dilation_h; - } else { - kernel_d = stride_d = dilation_d = 1; - pad_d = 0; - } - // Groups - int groups = conv_param->group(); - int o_g = out->shape(1) / groups; - int k_g = in->shape(1) / groups; - int o_head, k_head; - // Convolution - vector weight_offset(4 + has_depth); - vector in_offset(4 + has_depth); - vector out_offset(4 + has_depth); - Dtype* out_data = out->mutable_cpu_data(); - for (int n = 0; n < out->shape(0); n++) { - for (int g = 0; g < groups; g++) { - o_head = o_g * g; - k_head = k_g * g; - for (int o = 0; o < o_g; o++) { - for (int k = 0; k < k_g; k++) { - for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) { - for (int y = 0; y < out->shape(2 + has_depth); y++) { - for (int x = 0; x < out->shape(3 + has_depth); x++) { - for (int r = 0; r < kernel_d; r++) { - for (int p = 0; p < kernel_h; p++) { - for (int q = 0; q < kernel_w; q++) { - int in_z = z * stride_d - pad_d + r * dilation_d; - int in_y = y * stride_h - pad_h + p * dilation_h; - int in_x = x * stride_w - pad_w + q * dilation_w; - if (in_z >= 0 && in_z < (has_depth ? in->shape(2) : 1) - && in_y >= 0 && in_y < in->shape(2 + has_depth) - && in_x >= 0 && in_x < in->shape(3 + has_depth)) { - weight_offset[0] = o + o_head; - weight_offset[1] = k; - if (has_depth) { weight_offset[2] = r; } - weight_offset[2 + has_depth] = p; - weight_offset[3 + has_depth] = q; - in_offset[0] = n; - in_offset[1] = k + k_head; - if (has_depth) { in_offset[2] = in_z; } - in_offset[2 + has_depth] = in_y; - in_offset[3 + has_depth] = in_x; - out_offset[0] = n; - out_offset[1] = o + o_head; - if (has_depth) { out_offset[2] = z; } - out_offset[2 + has_depth] = y; - out_offset[3 + has_depth] = x; - out_data[out->offset(out_offset)] += - in->data_at(in_offset) - * weights[0]->data_at(weight_offset); - } - } - } - } - } - } - } - } - } - } - } - // Bias - if (conv_param->bias_term()) { - const Dtype* bias_data = weights[1]->cpu_data(); - for (int n = 0; n < out->shape(0); n++) { - for (int o = 0; o < out->shape(1); o++) { - for (int z = 0; z < (has_depth ? 
out->shape(2) : 1); z++) { - for (int y = 0; y < out->shape(2 + has_depth); y++) { - for (int x = 0; x < out->shape(3 + has_depth); x++) { - out_offset[0] = n; - out_offset[1] = o; - if (has_depth) { out_offset[2] = z; } - out_offset[2 + has_depth] = y; - out_offset[3 + has_depth] = x; - out_data[out->offset(out_offset)] += bias_data[o]; - } - } - } - } - } - } -} - -template void caffe_conv(const Blob* in, - ConvolutionParameter* conv_param, - const vector > >& weights, - Blob* out); -template void caffe_conv(const Blob* in, - ConvolutionParameter* conv_param, - const vector > >& weights, - Blob* out); - -template -class ConvolutionLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - ConvolutionLayerTest() - : blob_bottom_(new Blob(2, 3, 6, 4)), - blob_bottom_2_(new Blob(2, 3, 6, 4)), - blob_top_(new Blob()), - blob_top_2_(new Blob()) {} - virtual void SetUp() { - // fill the values - FillerParameter filler_param; - filler_param.set_value(1.); - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - filler.Fill(this->blob_bottom_2_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - - virtual ~ConvolutionLayerTest() { - delete blob_bottom_; - delete blob_bottom_2_; - delete blob_top_; - delete blob_top_2_; - } - - virtual Blob* MakeReferenceTop(Blob* top) { - this->ref_blob_top_.reset(new Blob()); - this->ref_blob_top_->ReshapeLike(*top); - return this->ref_blob_top_.get(); - } - - Blob* const blob_bottom_; - Blob* const blob_bottom_2_; - Blob* const blob_top_; - Blob* const blob_top_2_; - shared_ptr > ref_blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(ConvolutionLayerTest, TestDtypesAndDevices); - -TYPED_TEST(ConvolutionLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(4); - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 4); - EXPECT_EQ(this->blob_top_->height(), 2); - EXPECT_EQ(this->blob_top_->width(), 1); - EXPECT_EQ(this->blob_top_2_->num(), 2); - EXPECT_EQ(this->blob_top_2_->channels(), 4); - EXPECT_EQ(this->blob_top_2_->height(), 2); - EXPECT_EQ(this->blob_top_2_->width(), 1); - // setting group should not change the shape - convolution_param->set_num_output(3); - convolution_param->set_group(3); - layer.reset(new ConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 3); - EXPECT_EQ(this->blob_top_->height(), 2); - EXPECT_EQ(this->blob_top_->width(), 1); - EXPECT_EQ(this->blob_top_2_->num(), 2); - EXPECT_EQ(this->blob_top_2_->channels(), 3); - EXPECT_EQ(this->blob_top_2_->height(), 2); - EXPECT_EQ(this->blob_top_2_->width(), 1); -} - -TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - 
layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(4); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("constant"); - convolution_param->mutable_bias_filler()->set_value(0.1); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. - const Dtype* top_data; - const Dtype* ref_top_data; - caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_)); - top_data = this->blob_top_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } - caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_2_)); - top_data = this->blob_top_2_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } -} - -TYPED_TEST(ConvolutionLayerTest, TestDilatedConvolution) { - typedef typename TypeParam::Dtype Dtype; - vector bottom_shape; - bottom_shape.push_back(2); - bottom_shape.push_back(3); - bottom_shape.push_back(8); - bottom_shape.push_back(7); - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_dilation(2); - convolution_param->set_num_output(4); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("constant"); - convolution_param->mutable_bias_filler()->set_value(0.1); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. 
- const Dtype* top_data; - const Dtype* ref_top_data; - caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_)); - top_data = this->blob_top_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } - caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_2_)); - top_data = this->blob_top_2_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } -} - -TYPED_TEST(ConvolutionLayerTest, Test0DConvolution) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - const int kNumOutput = 3; - convolution_param->set_num_output(kNumOutput); - convolution_param->set_axis(3); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - vector top_shape = this->blob_bottom_->shape(); - top_shape[3] = kNumOutput; - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(top_shape, this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. - vector weight_offset(2); - const Blob* weight = layer->blobs()[0].get(); - const Blob* bias = layer->blobs()[1].get(); - const int num = this->blob_top_->count(3); - const int dim = this->blob_top_->shape(3); - const int bottom_dim = this->blob_bottom_->shape(3); - for (int n = 0; n < num; ++n) { - for (int d = 0; d < dim; ++d) { - weight_offset[0] = d; - Dtype value = bias->cpu_data()[d]; - for (int bottom_d = 0; bottom_d < bottom_dim; ++bottom_d) { - weight_offset[1] = bottom_d; - value += weight->data_at(weight_offset) * - this->blob_bottom_->cpu_data()[n * bottom_dim + bottom_d]; - } - EXPECT_NEAR(value, this->blob_top_->cpu_data()[n * dim + d], 1e-4); - } - } -} - -TYPED_TEST(ConvolutionLayerTest, TestSimple3DConvolution) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - vector bottom_shape(5); - bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); - bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); - bottom_shape[2] = 5; - bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2); - bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(4); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. 
- const Dtype* top_data; - const Dtype* ref_top_data; - caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_)); - top_data = this->blob_top_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } - caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_2_)); - top_data = this->blob_top_2_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } -} - -TYPED_TEST(ConvolutionLayerTest, TestDilated3DConvolution) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - vector bottom_shape(5); - bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); - bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); - bottom_shape[2] = 6; - bottom_shape[3] = 7; - bottom_shape[4] = 8; - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_dilation(2); - convolution_param->set_num_output(4); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. - const Dtype* top_data; - const Dtype* ref_top_data; - caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_)); - top_data = this->blob_top_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } - caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_2_)); - top_data = this->blob_top_2_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } -} - -TYPED_TEST(ConvolutionLayerTest, Test1x1Convolution) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(1); - convolution_param->add_stride(1); - convolution_param->set_num_output(4); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("constant"); - convolution_param->mutable_bias_filler()->set_value(0.1); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. 
- const Dtype* top_data; - const Dtype* ref_top_data; - caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_)); - top_data = this->blob_top_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } -} - -TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolutionGroup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(3); - convolution_param->set_group(3); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("constant"); - convolution_param->mutable_bias_filler()->set_value(0.1); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. - const Dtype* top_data; - const Dtype* ref_top_data; - caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_)); - top_data = this->blob_top_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } -} - -TYPED_TEST(ConvolutionLayerTest, TestSobelConvolution) { - // Test separable convolution by computing the Sobel operator - // as a single filter then comparing the result - // as the convolution of two rectangular filters. - typedef typename TypeParam::Dtype Dtype; - // Fill bottoms with identical Gaussian noise. - shared_ptr > filler; - FillerParameter filler_param; - filler_param.set_value(1.); - filler.reset(new GaussianFiller(filler_param)); - filler->Fill(this->blob_bottom_); - this->blob_bottom_2_->CopyFrom(*this->blob_bottom_); - // Compute Sobel G_x operator as 3 x 3 convolution. - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(1); - convolution_param->set_bias_term(false); - shared_ptr > layer( - new ConvolutionLayer(layer_param)); - layer->blobs().resize(1); - layer->blobs()[0].reset(new Blob(1, 3, 3, 3)); - Dtype* weights = layer->blobs()[0]->mutable_cpu_data(); - for (int c = 0; c < 3; ++c) { - int i = c * 9; // 3 x 3 filter - weights[i + 0] = -1; - weights[i + 1] = 0; - weights[i + 2] = 1; - weights[i + 3] = -2; - weights[i + 4] = 0; - weights[i + 5] = 2; - weights[i + 6] = -1; - weights[i + 7] = 0; - weights[i + 8] = 1; - } - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Compute Sobel G_x operator as separable 3 x 1 and 1 x 3 convolutions. 
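The separable check that follows relies on the 3 x 3 Sobel G_x kernel factoring into an outer product of the two 1-D filters applied in steps (1) and (2) below; as a worked equation (not from the original file):

G_x = \begin{bmatrix} -1 & 0 & 1 \\ -2 & 0 & 2 \\ -1 & 0 & 1 \end{bmatrix}
    = \begin{bmatrix} 1 \\ 2 \\ 1 \end{bmatrix}
      \begin{bmatrix} -1 & 0 & 1 \end{bmatrix}

so the column pass followed by the row pass should match the single 3 x 3 pass to within the 1e-4 EXPECT_NEAR tolerance used here.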
- // (1) the [1 2 1] column filter - vector*> sep_blob_bottom_vec; - vector*> sep_blob_top_vec; - shared_ptr > blob_sep(new Blob()); - sep_blob_bottom_vec.push_back(this->blob_bottom_2_); - sep_blob_top_vec.push_back(this->blob_top_2_); - convolution_param->clear_kernel_size(); - convolution_param->clear_stride(); - convolution_param->set_kernel_h(3); - convolution_param->set_kernel_w(1); - convolution_param->set_stride_h(2); - convolution_param->set_stride_w(1); - convolution_param->set_num_output(1); - convolution_param->set_bias_term(false); - layer.reset(new ConvolutionLayer(layer_param)); - layer->blobs().resize(1); - layer->blobs()[0].reset(new Blob(1, 3, 3, 1)); - Dtype* weights_1 = layer->blobs()[0]->mutable_cpu_data(); - for (int c = 0; c < 3; ++c) { - int i = c * 3; // 3 x 1 filter - weights_1[i + 0] = 1; - weights_1[i + 1] = 2; - weights_1[i + 2] = 1; - } - layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); - layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); - // (2) the [-1 0 1] row filter - blob_sep->CopyFrom(*this->blob_top_2_, false, true); - sep_blob_bottom_vec.clear(); - sep_blob_bottom_vec.push_back(blob_sep.get()); - convolution_param->set_kernel_h(1); - convolution_param->set_kernel_w(3); - convolution_param->set_stride_h(1); - convolution_param->set_stride_w(2); - convolution_param->set_num_output(1); - convolution_param->set_bias_term(false); - layer.reset(new ConvolutionLayer(layer_param)); - layer->blobs().resize(1); - layer->blobs()[0].reset(new Blob(1, 1, 1, 3)); - Dtype* weights_2 = layer->blobs()[0]->mutable_cpu_data(); - weights_2[0] = -1; - weights_2[1] = 0; - weights_2[2] = 1; - layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); - layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); - // Test equivalence of full and separable filters. - const Dtype* top_data = this->blob_top_->cpu_data(); - const Dtype* sep_top_data = this->blob_top_2_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], sep_top_data[i], 1e-4); - } -} - -TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) { - typedef typename TypeParam::Dtype Dtype; - const int kernel_h = 11; - const int kernel_w = 13; - vector bottom_shape(4); - bottom_shape[0] = 15; - bottom_shape[1] = 18; - bottom_shape[2] = kernel_h * 2; - bottom_shape[3] = kernel_w * 2; - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->set_num_output(12); - convolution_param->set_bias_term(false); - convolution_param->set_group(6); - convolution_param->set_kernel_h(kernel_h); - convolution_param->set_kernel_w(kernel_w); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - Blob weights; - Blob top_diff; - // Shape and fill weights and top_diff. 
- bool copy_diff; - bool reshape; - { - ConvolutionLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - top_diff.ReshapeLike(*this->blob_top_); - filler.Fill(&top_diff); - ASSERT_EQ(1, layer.blobs().size()); - copy_diff = false; reshape = true; - weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape); - } - vector propagate_down(1, true); - Blob result_2d; - Blob backward_result_2d; - Blob backward_weight_result_2d; - // Test with 2D im2col - { - caffe_set(this->blob_top_->count(), Dtype(0), - this->blob_top_->mutable_cpu_data()); - caffe_set(this->blob_bottom_->count(), Dtype(0), - this->blob_bottom_->mutable_cpu_diff()); - caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); - // Do SetUp and Forward; save Forward result in result_2d. - convolution_param->set_force_nd_im2col(false); - ConvolutionLayer layer_2d(layer_param); - layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(1, layer_2d.blobs().size()); - copy_diff = false; reshape = false; - layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape); - layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - copy_diff = false; reshape = true; - result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape); - // Copy pre-generated top diff into actual top diff; - // do Backward and save result in backward_result_2d. - ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer_2d.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - copy_diff = true; reshape = true; - backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape); - backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape); - } - Blob result_nd; - Blob backward_result_nd; - Blob backward_weight_result_nd; - // Test with ND im2col - { - caffe_set(this->blob_top_->count(), Dtype(0), - this->blob_top_->mutable_cpu_data()); - caffe_set(this->blob_bottom_->count(), Dtype(0), - this->blob_bottom_->mutable_cpu_diff()); - caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); - // Do SetUp and Forward; save Forward result in result_nd. - convolution_param->set_force_nd_im2col(true); - ConvolutionLayer layer_nd(layer_param); - layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(1, layer_nd.blobs().size()); - copy_diff = false; reshape = false; - layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape); - layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - copy_diff = false; reshape = true; - result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape); - // Copy pre-generated top diff into actual top diff; - // do Backward and save result in backward_result_nd. 
- ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer_nd.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - copy_diff = true; reshape = true; - backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape); - backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape); - } - ASSERT_EQ(result_nd.count(), result_2d.count()); - for (int i = 0; i < result_2d.count(); ++i) { - EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]); - } - ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count()); - for (int i = 0; i < backward_result_2d.count(); ++i) { - EXPECT_EQ(backward_result_2d.cpu_diff()[i], - backward_result_nd.cpu_diff()[i]); - } - ASSERT_EQ(backward_weight_result_nd.count(), - backward_weight_result_2d.count()); - for (int i = 0; i < backward_weight_result_2d.count(); ++i) { - EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i], - backward_weight_result_nd.cpu_diff()[i]); - } -} - -TYPED_TEST(ConvolutionLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(2); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - ConvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ConvolutionLayerTest, TestDilatedGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - vector bottom_shape; - bottom_shape.push_back(2); - bottom_shape.push_back(3); - bottom_shape.push_back(5); - bottom_shape.push_back(6); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - } - convolution_param->add_kernel_size(3); - convolution_param->add_dilation(2); - convolution_param->set_num_output(2); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - ConvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ConvolutionLayerTest, TestGradient3D) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - vector bottom_shape(5); - bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); - bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); - bottom_shape[2] = 5; - bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2); - bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(2); - 
convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - ConvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ConvolutionLayerTest, Test1x1Gradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - convolution_param->add_kernel_size(1); - convolution_param->add_stride(1); - convolution_param->set_num_output(2); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - ConvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ConvolutionLayerTest, TestGradientGroup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(3); - convolution_param->set_group(3); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - ConvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -#ifdef USE_CUDNN - -template -class CuDNNConvolutionLayerTest : public GPUDeviceTest { - protected: - CuDNNConvolutionLayerTest() - : blob_bottom_(new Blob(2, 3, 6, 4)), - blob_bottom_2_(new Blob(2, 3, 6, 4)), - blob_top_(new Blob()), - blob_top_2_(new Blob()) {} - virtual void SetUp() { - // fill the values - FillerParameter filler_param; - filler_param.set_value(1.); - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - filler.Fill(this->blob_bottom_2_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - - virtual ~CuDNNConvolutionLayerTest() { - delete blob_bottom_; - delete blob_bottom_2_; - delete blob_top_; - delete blob_top_2_; - } - - virtual Blob* MakeReferenceTop(Blob* top) { - this->ref_blob_top_.reset(new Blob()); - this->ref_blob_top_->ReshapeLike(*top); - return this->ref_blob_top_.get(); - } - - Blob* const blob_bottom_; - Blob* const blob_bottom_2_; - Blob* const blob_top_; - Blob* const blob_top_2_; - shared_ptr > ref_blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(CuDNNConvolutionLayerTest, TestDtypes); - -TYPED_TEST(CuDNNConvolutionLayerTest, TestSetupCuDNN) { - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(4); - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - shared_ptr > layer( - new CuDNNConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - 
EXPECT_EQ(this->blob_top_->channels(), 4); - EXPECT_EQ(this->blob_top_->height(), 2); - EXPECT_EQ(this->blob_top_->width(), 1); - EXPECT_EQ(this->blob_top_2_->num(), 2); - EXPECT_EQ(this->blob_top_2_->channels(), 4); - EXPECT_EQ(this->blob_top_2_->height(), 2); - EXPECT_EQ(this->blob_top_2_->width(), 1); - // setting group should not change the shape - convolution_param->set_num_output(3); - convolution_param->set_group(3); - layer.reset(new CuDNNConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 3); - EXPECT_EQ(this->blob_top_->height(), 2); - EXPECT_EQ(this->blob_top_->width(), 1); - EXPECT_EQ(this->blob_top_2_->num(), 2); - EXPECT_EQ(this->blob_top_2_->channels(), 3); - EXPECT_EQ(this->blob_top_2_->height(), 2); - EXPECT_EQ(this->blob_top_2_->width(), 1); -} - -TYPED_TEST(CuDNNConvolutionLayerTest, TestSimpleConvolutionCuDNN) { - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(4); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("constant"); - convolution_param->mutable_bias_filler()->set_value(0.1); - shared_ptr > layer( - new CuDNNConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. - const TypeParam* top_data; - const TypeParam* ref_top_data; - caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_)); - top_data = this->blob_top_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } - caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_2_)); - top_data = this->blob_top_2_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } -} - -TYPED_TEST(CuDNNConvolutionLayerTest, TestSimpleConvolutionGroupCuDNN) { - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(3); - convolution_param->set_group(3); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("constant"); - convolution_param->mutable_bias_filler()->set_value(0.1); - shared_ptr > layer( - new CuDNNConvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Check against reference convolution. 
- const TypeParam* top_data; - const TypeParam* ref_top_data; - caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), - this->MakeReferenceTop(this->blob_top_)); - top_data = this->blob_top_->cpu_data(); - ref_top_data = this->ref_blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); - } -} - -TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) { - // Test separable convolution by computing the Sobel operator - // as a single filter then comparing the result - // as the convolution of two rectangular filters. - - // Fill bottoms with identical Gaussian noise. - shared_ptr > filler; - FillerParameter filler_param; - filler_param.set_value(1.); - filler.reset(new GaussianFiller(filler_param)); - filler->Fill(this->blob_bottom_); - this->blob_bottom_2_->CopyFrom(*this->blob_bottom_); - // Compute Sobel G_x operator as 3 x 3 convolution. - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(1); - convolution_param->set_bias_term(false); - shared_ptr > layer( - new CuDNNConvolutionLayer(layer_param)); - layer->blobs().resize(1); - layer->blobs()[0].reset(new Blob(1, 3, 3, 3)); - TypeParam* weights = layer->blobs()[0]->mutable_cpu_data(); - for (int c = 0; c < 3; ++c) { - int i = c * 9; // 3 x 3 filter - weights[i + 0] = -1; - weights[i + 1] = 0; - weights[i + 2] = 1; - weights[i + 3] = -2; - weights[i + 4] = 0; - weights[i + 5] = 2; - weights[i + 6] = -1; - weights[i + 7] = 0; - weights[i + 8] = 1; - } - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Compute Sobel G_x operator as separable 3 x 1 and 1 x 3 convolutions. 
- // (1) the [1 2 1] column filter - vector*> sep_blob_bottom_vec; - vector*> sep_blob_top_vec; - shared_ptr > blob_sep(new Blob()); - sep_blob_bottom_vec.push_back(this->blob_bottom_2_); - sep_blob_top_vec.push_back(this->blob_top_2_); - convolution_param->clear_kernel_size(); - convolution_param->clear_stride(); - convolution_param->set_kernel_h(3); - convolution_param->set_kernel_w(1); - convolution_param->set_stride_h(2); - convolution_param->set_stride_w(1); - convolution_param->set_num_output(1); - convolution_param->set_bias_term(false); - layer.reset(new CuDNNConvolutionLayer(layer_param)); - layer->blobs().resize(1); - layer->blobs()[0].reset(new Blob(1, 3, 3, 1)); - TypeParam* weights_1 = layer->blobs()[0]->mutable_cpu_data(); - for (int c = 0; c < 3; ++c) { - int i = c * 3; // 3 x 1 filter - weights_1[i + 0] = 1; - weights_1[i + 1] = 2; - weights_1[i + 2] = 1; - } - layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); - layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); - // (2) the [-1 0 1] row filter - blob_sep->CopyFrom(*this->blob_top_2_, false, true); - sep_blob_bottom_vec.clear(); - sep_blob_bottom_vec.push_back(blob_sep.get()); - convolution_param->set_kernel_h(1); - convolution_param->set_kernel_w(3); - convolution_param->set_stride_h(1); - convolution_param->set_stride_w(2); - convolution_param->set_num_output(1); - convolution_param->set_bias_term(false); - layer.reset(new CuDNNConvolutionLayer(layer_param)); - layer->blobs().resize(1); - layer->blobs()[0].reset(new Blob(1, 1, 1, 3)); - TypeParam* weights_2 = layer->blobs()[0]->mutable_cpu_data(); - weights_2[0] = -1; - weights_2[1] = 0; - weights_2[2] = 1; - layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); - layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); - // Test equivalence of full and separable filters. 
- const TypeParam* top_data = this->blob_top_->cpu_data(); - const TypeParam* sep_top_data = this->blob_top_2_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - EXPECT_NEAR(top_data[i], sep_top_data[i], 1e-4); - } -} - -TYPED_TEST(CuDNNConvolutionLayerTest, TestGradientCuDNN) { - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(2); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - CuDNNConvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(CuDNNConvolutionLayerTest, TestGradientGroupCuDNN) { - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(3); - convolution_param->set_group(3); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - CuDNNConvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -#endif - -} // namespace caffe diff --git a/src/caffe/test/test_crop_layer.cpp b/src/caffe/test/test_crop_layer.cpp deleted file mode 100644 index ce2c736..0000000 --- a/src/caffe/test/test_crop_layer.cpp +++ /dev/null @@ -1,283 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/crop_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class CropLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - CropLayerTest() - : blob_bottom_0_(new Blob(2, 4, 5, 4)), - blob_bottom_1_(new Blob(2, 3, 4, 2)), - blob_top_(new Blob()) {} - virtual void SetUp() { - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_0_); - filler.Fill(this->blob_bottom_1_); - - blob_bottom_vec_.push_back(blob_bottom_0_); - blob_bottom_vec_.push_back(blob_bottom_1_); - blob_top_vec_.push_back(blob_top_); - } - - virtual ~CropLayerTest() { - delete blob_bottom_0_; delete blob_bottom_1_; - delete blob_top_; - } - - Blob* const blob_bottom_0_; - Blob* const blob_bottom_1_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - - -TYPED_TEST_CASE(CropLayerTest, TestDtypesAndDevices); - -TYPED_TEST(CropLayerTest, TestSetupShapeAll) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - // Crop all dimensions - layer_param.mutable_crop_param()->set_axis(0); - CropLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->num_axes(); ++i) { - EXPECT_EQ(this->blob_bottom_1_->shape(i), this->blob_top_->shape(i)); - } -} - -TYPED_TEST(CropLayerTest, TestSetupShapeDefault) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; 
- // Crop last two dimensions, axis is 2 by default - CropLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->num_axes(); ++i) { - if (i < 2) { - EXPECT_EQ(this->blob_bottom_0_->shape(i), this->blob_top_->shape(i)); - } else { - EXPECT_EQ(this->blob_bottom_1_->shape(i), this->blob_top_->shape(i)); - } - } -} - -TYPED_TEST(CropLayerTest, TestSetupShapeNegativeIndexing) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - // Crop last dimension by negative indexing - layer_param.mutable_crop_param()->set_axis(-1); - CropLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->num_axes(); ++i) { - if (i < 3) { - EXPECT_EQ(this->blob_bottom_0_->shape(i), this->blob_top_->shape(i)); - } else { - EXPECT_EQ(this->blob_bottom_1_->shape(i), this->blob_top_->shape(i)); - } - } -} - -TYPED_TEST(CropLayerTest, TestDimensionsCheck) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - // Reshape size blob to have incompatible sizes for uncropped dimensions: - // the size blob has more channels than the data blob, but this is fine - // since the channels dimension is not cropped in this configuration. - this->blob_bottom_1_->Reshape(2, 5, 4, 2); - CropLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->num_axes(); ++i) { - if (i < 2) { - EXPECT_EQ(this->blob_bottom_0_->shape(i), this->blob_top_->shape(i)); - } else { - EXPECT_EQ(this->blob_bottom_1_->shape(i), this->blob_top_->shape(i)); - } - } -} - -TYPED_TEST(CropLayerTest, TestCropAll) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_crop_param()->set_axis(0); - CropLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_0_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_0_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_0_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_0_->width(); ++w) { - if ( n < this->blob_top_->shape(0) && - c < this->blob_top_->shape(1) && - h < this->blob_top_->shape(2) && - w < this->blob_top_->shape(3) ) { - EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_0_->data_at(n, c, h, w)); - } - } - } - } - } -} - -TYPED_TEST(CropLayerTest, TestCropAllOffset) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_crop_param()->set_axis(0); - layer_param.mutable_crop_param()->add_offset(0); - layer_param.mutable_crop_param()->add_offset(1); - layer_param.mutable_crop_param()->add_offset(1); - layer_param.mutable_crop_param()->add_offset(2); - CropLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_0_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_0_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_0_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_0_->width(); ++w) { - if ( n < this->blob_top_->shape(0) && - c < this->blob_top_->shape(1) && - h < this->blob_top_->shape(2) && - w < this->blob_top_->shape(3) ) { - EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_0_->data_at(n, c+1, h+1, w+2)); - } - } - } - } - } -} - 
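A minimal, self-contained sketch of the index mapping TestCropAllOffset verifies above, using plain arrays instead of the Blob API (names and example shapes are illustrative, not from the deleted file):

#include <vector>
// With axis 0 and offsets {0, 1, 1, 2}, each kept top element is read from the
// bottom position shifted by the per-axis offset, as in the EXPECT_EQ above.
std::vector<float> crop4d(const std::vector<float>& bottom,
                          const int b[4],      // bottom shape, e.g. {2, 4, 5, 4}
                          const int t[4],      // top shape,    e.g. {2, 3, 4, 2}
                          const int off[4]) {  // offsets,      e.g. {0, 1, 1, 2}
  std::vector<float> top(t[0] * t[1] * t[2] * t[3]);
  for (int n = 0; n < t[0]; ++n)
    for (int c = 0; c < t[1]; ++c)
      for (int h = 0; h < t[2]; ++h)
        for (int w = 0; w < t[3]; ++w)
          top[((n * t[1] + c) * t[2] + h) * t[3] + w] =
              bottom[(((n + off[0]) * b[1] + c + off[1]) * b[2] + h + off[2])
                         * b[3] + w + off[3]];
  return top;
}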
-TYPED_TEST(CropLayerTest, TestCropHW) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_crop_param()->set_axis(2); - layer_param.mutable_crop_param()->add_offset(1); - layer_param.mutable_crop_param()->add_offset(2); - CropLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_0_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_0_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_0_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_0_->width(); ++w) { - if (n < this->blob_top_->shape(0) && - c < this->blob_top_->shape(1) && - h < this->blob_top_->shape(2) && - w < this->blob_top_->shape(3)) { - EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_0_->data_at(n, c, h+1, w+2)); - } - } - } - } - } -} - -TYPED_TEST(CropLayerTest, TestCrop5D) { - typedef typename TypeParam::Dtype Dtype; - // Add dimension to each bottom for >4D check - vector bottom_0_shape = this->blob_bottom_0_->shape(); - vector bottom_1_shape = this->blob_bottom_1_->shape(); - bottom_0_shape.push_back(2); - bottom_1_shape.push_back(1); - this->blob_bottom_0_->Reshape(bottom_0_shape); - this->blob_bottom_1_->Reshape(bottom_1_shape); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_0_); - filler.Fill(this->blob_bottom_1_); - // Make layer - LayerParameter layer_param; - layer_param.mutable_crop_param()->set_axis(2); - layer_param.mutable_crop_param()->add_offset(1); - layer_param.mutable_crop_param()->add_offset(2); - layer_param.mutable_crop_param()->add_offset(0); - CropLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - vector bottom_idx = vector(5, 0); - vector top_idx = vector(5, 0); - for (int n = 0; n < this->blob_bottom_0_->shape(0); ++n) { - for (int c = 0; c < this->blob_bottom_0_->shape(1); ++c) { - for (int z = 0; z < this->blob_bottom_0_->shape(2); ++z) { - for (int h = 0; h < this->blob_bottom_0_->shape(3); ++h) { - for (int w = 0; w < this->blob_bottom_0_->shape(4); ++w) { - if (n < this->blob_top_->shape(0) && - c < this->blob_top_->shape(1) && - z < this->blob_top_->shape(2) && - h < this->blob_top_->shape(3) && - w < this->blob_top_->shape(4)) { - bottom_idx[0] = top_idx[0] = n; - bottom_idx[1] = top_idx[1] = c; - bottom_idx[2] = z; - bottom_idx[3] = h; - bottom_idx[4] = top_idx[4] = w; - top_idx[2] = z+1; - top_idx[3] = h+2; - EXPECT_EQ(this->blob_top_->data_at(bottom_idx), - this->blob_bottom_0_->data_at(top_idx)); - } - } - } - } - } - } -} - -TYPED_TEST(CropLayerTest, TestCropAllGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_crop_param()->set_axis(0); - CropLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(CropLayerTest, TestCropHWGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_crop_param()->set_axis(2); - layer_param.mutable_crop_param()->add_offset(1); - layer_param.mutable_crop_param()->add_offset(2); - CropLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(CropLayerTest, 
TestCrop5DGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_crop_param()->set_axis(2); - layer_param.mutable_crop_param()->add_offset(1); - layer_param.mutable_crop_param()->add_offset(2); - layer_param.mutable_crop_param()->add_offset(0); - CropLayer layer(layer_param); - // Add dimension to each bottom for >4D check - vector bottom_0_shape = this->blob_bottom_0_->shape(); - vector bottom_1_shape = this->blob_bottom_1_->shape(); - bottom_0_shape.push_back(2); - bottom_1_shape.push_back(1); - this->blob_bottom_0_->Reshape(bottom_0_shape); - this->blob_bottom_1_->Reshape(bottom_1_shape); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py deleted file mode 100644 index 2645073..0000000 --- a/src/caffe/test/test_data/generate_sample_data.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -Generate data used in the HDF5DataLayer and GradientBasedSolver tests. -""" -import os -import numpy as np -import h5py - -script_dir = os.path.dirname(os.path.abspath(__file__)) - -# Generate HDF5DataLayer sample_data.h5 - -num_cols = 8 -num_rows = 10 -height = 6 -width = 5 -total_size = num_cols * num_rows * height * width - -data = np.arange(total_size) -data = data.reshape(num_rows, num_cols, height, width) -data = data.astype('float32') - -# We had a bug where data was copied into label, but the tests weren't -# catching it, so let's make label 1-indexed. -label = 1 + np.arange(num_rows)[:, np.newaxis] -label = label.astype('float32') - -# We add an extra label2 dataset to test HDF5 layer's ability -# to handle arbitrary number of output ("top") Blobs. 
-label2 = label + 1 - -print data -print label - -with h5py.File(script_dir + '/sample_data.h5', 'w') as f: - f['data'] = data - f['label'] = label - f['label2'] = label2 - -with h5py.File(script_dir + '/sample_data_2_gzip.h5', 'w') as f: - f.create_dataset( - 'data', data=data + total_size, - compression='gzip', compression_opts=1 - ) - f.create_dataset( - 'label', data=label, - compression='gzip', compression_opts=1, - dtype='uint8', - ) - f.create_dataset( - 'label2', data=label2, - compression='gzip', compression_opts=1, - dtype='uint8', - ) - -with open(script_dir + '/sample_data_list.txt', 'w') as f: - f.write('src/caffe/test/test_data/sample_data.h5\n') - f.write('src/caffe/test/test_data/sample_data_2_gzip.h5\n') - -# Generate GradientBasedSolver solver_data.h5 - -num_cols = 3 -num_rows = 8 -height = 10 -width = 10 - -data = np.random.randn(num_rows, num_cols, height, width) -data = data.reshape(num_rows, num_cols, height, width) -data = data.astype('float32') - -targets = np.random.randn(num_rows, 1) -targets = targets.astype('float32') - -print data -print targets - -with h5py.File(script_dir + '/solver_data.h5', 'w') as f: - f['data'] = data - f['targets'] = targets - -with open(script_dir + '/solver_data_list.txt', 'w') as f: - f.write('src/caffe/test/test_data/solver_data.h5\n') diff --git a/src/caffe/test/test_data/sample_data.h5 b/src/caffe/test/test_data/sample_data.h5 deleted file mode 100644 index 236e66b..0000000 Binary files a/src/caffe/test/test_data/sample_data.h5 and /dev/null differ diff --git a/src/caffe/test/test_data/sample_data_2_gzip.h5 b/src/caffe/test/test_data/sample_data_2_gzip.h5 deleted file mode 100644 index 0cb9ef9..0000000 Binary files a/src/caffe/test/test_data/sample_data_2_gzip.h5 and /dev/null differ diff --git a/src/caffe/test/test_data/sample_data_list.txt b/src/caffe/test/test_data/sample_data_list.txt deleted file mode 100644 index cdf343f..0000000 --- a/src/caffe/test/test_data/sample_data_list.txt +++ /dev/null @@ -1,2 +0,0 @@ -src/caffe/test/test_data/sample_data.h5 -src/caffe/test/test_data/sample_data_2_gzip.h5 diff --git a/src/caffe/test/test_data/solver_data.h5 b/src/caffe/test/test_data/solver_data.h5 deleted file mode 100644 index 7ee05ea..0000000 Binary files a/src/caffe/test/test_data/solver_data.h5 and /dev/null differ diff --git a/src/caffe/test/test_data/solver_data_list.txt b/src/caffe/test/test_data/solver_data_list.txt deleted file mode 100644 index a6552f5..0000000 --- a/src/caffe/test/test_data/solver_data_list.txt +++ /dev/null @@ -1 +0,0 @@ -src/caffe/test/test_data/solver_data.h5 diff --git a/src/caffe/test/test_data_layer.cpp b/src/caffe/test/test_data_layer.cpp deleted file mode 100644 index 3e8d113..0000000 --- a/src/caffe/test/test_data_layer.cpp +++ /dev/null @@ -1,433 +0,0 @@ -#ifdef USE_OPENCV -#include -#include - -#include "boost/scoped_ptr.hpp" -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/data_layer.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/db.hpp" -#include "caffe/util/io.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -using boost::scoped_ptr; - -template -class DataLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - DataLayerTest() - : backend_(DataParameter_DB_LEVELDB), - blob_top_data_(new Blob()), - blob_top_label_(new Blob()), - seed_(1701) {} - virtual void SetUp() { - filename_.reset(new string()); - 
MakeTempDir(filename_.get()); - *filename_ += "/db"; - blob_top_vec_.push_back(blob_top_data_); - blob_top_vec_.push_back(blob_top_label_); - } - - // Fill the DB with data: if unique_pixels, each pixel is unique but - // all images are the same; else each image is unique but all pixels within - // an image are the same. - void Fill(const bool unique_pixels, DataParameter_DB backend) { - backend_ = backend; - LOG(INFO) << "Using temporary dataset " << *filename_; - scoped_ptr db(db::GetDB(backend)); - db->Open(*filename_, db::NEW); - scoped_ptr txn(db->NewTransaction()); - for (int i = 0; i < 5; ++i) { - Datum datum; - datum.set_label(i); - datum.set_channels(2); - datum.set_height(3); - datum.set_width(4); - std::string* data = datum.mutable_data(); - for (int j = 0; j < 24; ++j) { - int datum = unique_pixels ? j : i; - data->push_back(static_cast(datum)); - } - stringstream ss; - ss << i; - string out; - CHECK(datum.SerializeToString(&out)); - txn->Put(ss.str(), out); - } - txn->Commit(); - db->Close(); - } - - void TestRead() { - const Dtype scale = 3; - LayerParameter param; - param.set_phase(TRAIN); - DataParameter* data_param = param.mutable_data_param(); - data_param->set_batch_size(5); - data_param->set_source(filename_->c_str()); - data_param->set_backend(backend_); - - TransformationParameter* transform_param = - param.mutable_transform_param(); - transform_param->set_scale(scale); - - DataLayer layer(param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_data_->num(), 5); - EXPECT_EQ(blob_top_data_->channels(), 2); - EXPECT_EQ(blob_top_data_->height(), 3); - EXPECT_EQ(blob_top_data_->width(), 4); - EXPECT_EQ(blob_top_label_->num(), 5); - EXPECT_EQ(blob_top_label_->channels(), 1); - EXPECT_EQ(blob_top_label_->height(), 1); - EXPECT_EQ(blob_top_label_->width(), 1); - - for (int iter = 0; iter < 100; ++iter) { - layer.Forward(blob_bottom_vec_, blob_top_vec_); - for (int i = 0; i < 5; ++i) { - EXPECT_EQ(i, blob_top_label_->cpu_data()[i]); - } - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 24; ++j) { - EXPECT_EQ(scale * i, blob_top_data_->cpu_data()[i * 24 + j]) - << "debug: iter " << iter << " i " << i << " j " << j; - } - } - } - } - - void TestReshape(DataParameter_DB backend) { - const int num_inputs = 5; - // Save data of varying shapes. - LOG(INFO) << "Using temporary dataset " << *filename_; - scoped_ptr db(db::GetDB(backend)); - db->Open(*filename_, db::NEW); - scoped_ptr txn(db->NewTransaction()); - for (int i = 0; i < num_inputs; ++i) { - Datum datum; - datum.set_label(i); - datum.set_channels(2); - datum.set_height(i % 2 + 1); - datum.set_width(i % 4 + 1); - std::string* data = datum.mutable_data(); - const int data_size = datum.channels() * datum.height() * datum.width(); - for (int j = 0; j < data_size; ++j) { - data->push_back(static_cast(j)); - } - stringstream ss; - ss << i; - string out; - CHECK(datum.SerializeToString(&out)); - txn->Put(ss.str(), out); - } - txn->Commit(); - db->Close(); - - // Load and check data of various shapes. 
- LayerParameter param; - param.set_phase(TEST); - DataParameter* data_param = param.mutable_data_param(); - data_param->set_batch_size(1); - data_param->set_source(filename_->c_str()); - data_param->set_backend(backend); - - DataLayer layer(param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_data_->num(), 1); - EXPECT_EQ(blob_top_data_->channels(), 2); - EXPECT_EQ(blob_top_label_->num(), 1); - EXPECT_EQ(blob_top_label_->channels(), 1); - EXPECT_EQ(blob_top_label_->height(), 1); - EXPECT_EQ(blob_top_label_->width(), 1); - - for (int iter = 0; iter < num_inputs; ++iter) { - layer.Forward(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_data_->height(), iter % 2 + 1); - EXPECT_EQ(blob_top_data_->width(), iter % 4 + 1); - EXPECT_EQ(iter, blob_top_label_->cpu_data()[0]); - const int channels = blob_top_data_->channels(); - const int height = blob_top_data_->height(); - const int width = blob_top_data_->width(); - for (int c = 0; c < channels; ++c) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - const int idx = (c * height + h) * width + w; - EXPECT_EQ(idx, static_cast(blob_top_data_->cpu_data()[idx])) - << "debug: iter " << iter << " c " << c - << " h " << h << " w " << w; - } - } - } - } - } - - void TestReadCrop(Phase phase) { - const Dtype scale = 3; - LayerParameter param; - param.set_phase(phase); - Caffe::set_random_seed(1701); - - DataParameter* data_param = param.mutable_data_param(); - data_param->set_batch_size(5); - data_param->set_source(filename_->c_str()); - data_param->set_backend(backend_); - - TransformationParameter* transform_param = - param.mutable_transform_param(); - transform_param->set_scale(scale); - transform_param->set_crop_size(1); - - DataLayer layer(param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_data_->num(), 5); - EXPECT_EQ(blob_top_data_->channels(), 2); - EXPECT_EQ(blob_top_data_->height(), 1); - EXPECT_EQ(blob_top_data_->width(), 1); - EXPECT_EQ(blob_top_label_->num(), 5); - EXPECT_EQ(blob_top_label_->channels(), 1); - EXPECT_EQ(blob_top_label_->height(), 1); - EXPECT_EQ(blob_top_label_->width(), 1); - - for (int iter = 0; iter < 2; ++iter) { - layer.Forward(blob_bottom_vec_, blob_top_vec_); - for (int i = 0; i < 5; ++i) { - EXPECT_EQ(i, blob_top_label_->cpu_data()[i]); - } - int num_with_center_value = 0; - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 2; ++j) { - const Dtype center_value = scale * (j ? 17 : 5); - num_with_center_value += - (center_value == blob_top_data_->cpu_data()[i * 2 + j]); - // At TEST time, check that we always get center value. - if (phase == caffe::TEST) { - EXPECT_EQ(center_value, this->blob_top_data_->cpu_data()[i * 2 + j]) - << "debug: iter " << iter << " i " << i << " j " << j; - } - } - } - // At TRAIN time, check that we did not get the center crop all 10 times. - // (This check fails with probability 1-1/12^10 in a correct - // implementation, so we call set_random_seed.) - if (phase == caffe::TRAIN) { - EXPECT_LT(num_with_center_value, 10); - } - } - } - - void TestReadCropTrainSequenceSeeded() { - LayerParameter param; - param.set_phase(TRAIN); - DataParameter* data_param = param.mutable_data_param(); - data_param->set_batch_size(5); - data_param->set_source(filename_->c_str()); - data_param->set_backend(backend_); - - TransformationParameter* transform_param = - param.mutable_transform_param(); - transform_param->set_crop_size(1); - transform_param->set_mirror(true); - - // Get crop sequence with Caffe seed 1701. 
- Caffe::set_random_seed(seed_); - vector > crop_sequence; - { - DataLayer layer1(param); - layer1.SetUp(blob_bottom_vec_, blob_top_vec_); - for (int iter = 0; iter < 2; ++iter) { - layer1.Forward(blob_bottom_vec_, blob_top_vec_); - for (int i = 0; i < 5; ++i) { - EXPECT_EQ(i, blob_top_label_->cpu_data()[i]); - } - vector iter_crop_sequence; - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 2; ++j) { - iter_crop_sequence.push_back( - blob_top_data_->cpu_data()[i * 2 + j]); - } - } - crop_sequence.push_back(iter_crop_sequence); - } - } // destroy 1st data layer and unlock the db - - // Get crop sequence after reseeding Caffe with 1701. - // Check that the sequence is the same as the original. - Caffe::set_random_seed(seed_); - DataLayer layer2(param); - layer2.SetUp(blob_bottom_vec_, blob_top_vec_); - for (int iter = 0; iter < 2; ++iter) { - layer2.Forward(blob_bottom_vec_, blob_top_vec_); - for (int i = 0; i < 5; ++i) { - EXPECT_EQ(i, blob_top_label_->cpu_data()[i]); - } - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 2; ++j) { - EXPECT_EQ(crop_sequence[iter][i * 2 + j], - blob_top_data_->cpu_data()[i * 2 + j]) - << "debug: iter " << iter << " i " << i << " j " << j; - } - } - } - } - - void TestReadCropTrainSequenceUnseeded() { - LayerParameter param; - param.set_phase(TRAIN); - DataParameter* data_param = param.mutable_data_param(); - data_param->set_batch_size(5); - data_param->set_source(filename_->c_str()); - data_param->set_backend(backend_); - - TransformationParameter* transform_param = - param.mutable_transform_param(); - transform_param->set_crop_size(1); - transform_param->set_mirror(true); - - // Get crop sequence with Caffe seed 1701, srand seed 1701. - Caffe::set_random_seed(seed_); - srand(seed_); - vector > crop_sequence; - { - DataLayer layer1(param); - layer1.SetUp(blob_bottom_vec_, blob_top_vec_); - for (int iter = 0; iter < 2; ++iter) { - layer1.Forward(blob_bottom_vec_, blob_top_vec_); - for (int i = 0; i < 5; ++i) { - EXPECT_EQ(i, blob_top_label_->cpu_data()[i]); - } - vector iter_crop_sequence; - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 2; ++j) { - iter_crop_sequence.push_back( - blob_top_data_->cpu_data()[i * 2 + j]); - } - } - crop_sequence.push_back(iter_crop_sequence); - } - } // destroy 1st data layer and unlock the db - - // Get crop sequence continuing from previous Caffe RNG state; reseed - // srand with 1701. Check that the sequence differs from the original. 
- srand(seed_); - DataLayer layer2(param); - layer2.SetUp(blob_bottom_vec_, blob_top_vec_); - for (int iter = 0; iter < 2; ++iter) { - layer2.Forward(blob_bottom_vec_, blob_top_vec_); - for (int i = 0; i < 5; ++i) { - EXPECT_EQ(i, blob_top_label_->cpu_data()[i]); - } - int num_sequence_matches = 0; - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 2; ++j) { - num_sequence_matches += (crop_sequence[iter][i * 2 + j] == - blob_top_data_->cpu_data()[i * 2 + j]); - } - } - EXPECT_LT(num_sequence_matches, 10); - } - } - - virtual ~DataLayerTest() { delete blob_top_data_; delete blob_top_label_; } - - DataParameter_DB backend_; - shared_ptr filename_; - Blob* const blob_top_data_; - Blob* const blob_top_label_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - int seed_; -}; - -TYPED_TEST_CASE(DataLayerTest, TestDtypesAndDevices); - -#ifdef USE_LEVELDB -TYPED_TEST(DataLayerTest, TestReadLevelDB) { - const bool unique_pixels = false; // all pixels the same; images different - this->Fill(unique_pixels, DataParameter_DB_LEVELDB); - this->TestRead(); -} - -TYPED_TEST(DataLayerTest, TestReshapeLevelDB) { - this->TestReshape(DataParameter_DB_LEVELDB); -} - -TYPED_TEST(DataLayerTest, TestReadCropTrainLevelDB) { - const bool unique_pixels = true; // all images the same; pixels different - this->Fill(unique_pixels, DataParameter_DB_LEVELDB); - this->TestReadCrop(TRAIN); -} - -// Test that the sequence of random crops is consistent when using -// Caffe::set_random_seed. -TYPED_TEST(DataLayerTest, TestReadCropTrainSequenceSeededLevelDB) { - const bool unique_pixels = true; // all images the same; pixels different - this->Fill(unique_pixels, DataParameter_DB_LEVELDB); - this->TestReadCropTrainSequenceSeeded(); -} - -// Test that the sequence of random crops differs across iterations when -// Caffe::set_random_seed isn't called (and seeds from srand are ignored). -TYPED_TEST(DataLayerTest, TestReadCropTrainSequenceUnseededLevelDB) { - const bool unique_pixels = true; // all images the same; pixels different - this->Fill(unique_pixels, DataParameter_DB_LEVELDB); - this->TestReadCropTrainSequenceUnseeded(); -} - -TYPED_TEST(DataLayerTest, TestReadCropTestLevelDB) { - const bool unique_pixels = true; // all images the same; pixels different - this->Fill(unique_pixels, DataParameter_DB_LEVELDB); - this->TestReadCrop(TEST); -} -#endif // USE_LEVELDB - -#ifdef USE_LMDB -TYPED_TEST(DataLayerTest, TestReadLMDB) { - const bool unique_pixels = false; // all pixels the same; images different - this->Fill(unique_pixels, DataParameter_DB_LMDB); - this->TestRead(); -} - -TYPED_TEST(DataLayerTest, TestReshapeLMDB) { - this->TestReshape(DataParameter_DB_LMDB); -} - -TYPED_TEST(DataLayerTest, TestReadCropTrainLMDB) { - const bool unique_pixels = true; // all images the same; pixels different - this->Fill(unique_pixels, DataParameter_DB_LMDB); - this->TestReadCrop(TRAIN); -} - -// Test that the sequence of random crops is consistent when using -// Caffe::set_random_seed. -TYPED_TEST(DataLayerTest, TestReadCropTrainSequenceSeededLMDB) { - const bool unique_pixels = true; // all images the same; pixels different - this->Fill(unique_pixels, DataParameter_DB_LMDB); - this->TestReadCropTrainSequenceSeeded(); -} - -// Test that the sequence of random crops differs across iterations when -// Caffe::set_random_seed isn't called (and seeds from srand are ignored). 
-TYPED_TEST(DataLayerTest, TestReadCropTrainSequenceUnseededLMDB) { - const bool unique_pixels = true; // all images the same; pixels different - this->Fill(unique_pixels, DataParameter_DB_LMDB); - this->TestReadCropTrainSequenceUnseeded(); -} - -TYPED_TEST(DataLayerTest, TestReadCropTestLMDB) { - const bool unique_pixels = true; // all images the same; pixels different - this->Fill(unique_pixels, DataParameter_DB_LMDB); - this->TestReadCrop(TEST); -} - -#endif // USE_LMDB -} // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/test/test_data_transformer.cpp b/src/caffe/test/test_data_transformer.cpp deleted file mode 100644 index 31bf1c1..0000000 --- a/src/caffe/test/test_data_transformer.cpp +++ /dev/null @@ -1,344 +0,0 @@ -#ifdef USE_OPENCV -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/data_transformer.hpp" -#include "caffe/filler.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/io.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -void FillDatum(const int label, const int channels, const int height, - const int width, const bool unique_pixels, Datum * datum) { - datum->set_label(label); - datum->set_channels(channels); - datum->set_height(height); - datum->set_width(width); - int size = channels * height * width; - std::string* data = datum->mutable_data(); - for (int j = 0; j < size; ++j) { - int datum = unique_pixels ? j : label; - data->push_back(static_cast(datum)); - } -} - -template -class DataTransformTest : public ::testing::Test { - protected: - DataTransformTest() - : seed_(1701), - num_iter_(10) {} - - int NumSequenceMatches(const TransformationParameter transform_param, - const Datum& datum, Phase phase) { - // Get crop sequence with Caffe seed 1701. 
- DataTransformer transformer(transform_param, phase); - const int crop_size = transform_param.crop_size(); - Caffe::set_random_seed(seed_); - transformer.InitRand(); - Blob blob(1, datum.channels(), datum.height(), datum.width()); - if (transform_param.crop_size() > 0) { - blob.Reshape(1, datum.channels(), crop_size, crop_size); - } - - vector > crop_sequence; - for (int iter = 0; iter < this->num_iter_; ++iter) { - vector iter_crop_sequence; - transformer.Transform(datum, &blob); - for (int j = 0; j < blob.count(); ++j) { - iter_crop_sequence.push_back(blob.cpu_data()[j]); - } - crop_sequence.push_back(iter_crop_sequence); - } - // Check if the sequence differs from the previous - int num_sequence_matches = 0; - for (int iter = 0; iter < this->num_iter_; ++iter) { - vector iter_crop_sequence = crop_sequence[iter]; - transformer.Transform(datum, &blob); - for (int j = 0; j < blob.count(); ++j) { - num_sequence_matches += (crop_sequence[iter][j] == blob.cpu_data()[j]); - } - } - return num_sequence_matches; - } - - int seed_; - int num_iter_; -}; - -TYPED_TEST_CASE(DataTransformTest, TestDtypes); - -TYPED_TEST(DataTransformTest, TestEmptyTransform) { - TransformationParameter transform_param; - const bool unique_pixels = false; // all pixels the same equal to label - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - Blob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); - transformer.InitRand(); - transformer.Transform(datum, &blob); - EXPECT_EQ(blob.num(), 1); - EXPECT_EQ(blob.channels(), datum.channels()); - EXPECT_EQ(blob.height(), datum.height()); - EXPECT_EQ(blob.width(), datum.width()); - for (int j = 0; j < blob.count(); ++j) { - EXPECT_EQ(blob.cpu_data()[j], label); - } -} - -TYPED_TEST(DataTransformTest, TestEmptyTransformUniquePixels) { - TransformationParameter transform_param; - const bool unique_pixels = true; // pixels are consecutive ints [0,size] - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - Blob blob(1, 3, 4, 5); - DataTransformer transformer(transform_param, TEST); - transformer.InitRand(); - transformer.Transform(datum, &blob); - EXPECT_EQ(blob.num(), 1); - EXPECT_EQ(blob.channels(), datum.channels()); - EXPECT_EQ(blob.height(), datum.height()); - EXPECT_EQ(blob.width(), datum.width()); - for (int j = 0; j < blob.count(); ++j) { - EXPECT_EQ(blob.cpu_data()[j], j); - } -} - -TYPED_TEST(DataTransformTest, TestCropSize) { - TransformationParameter transform_param; - const bool unique_pixels = false; // all pixels the same equal to label - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - const int crop_size = 2; - - transform_param.set_crop_size(crop_size); - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - DataTransformer transformer(transform_param, TEST); - transformer.InitRand(); - Blob blob(1, channels, crop_size, crop_size); - for (int iter = 0; iter < this->num_iter_; ++iter) { - transformer.Transform(datum, &blob); - EXPECT_EQ(blob.num(), 1); - EXPECT_EQ(blob.channels(), datum.channels()); - EXPECT_EQ(blob.height(), crop_size); - EXPECT_EQ(blob.width(), crop_size); - for (int j = 0; j < blob.count(); ++j) { - EXPECT_EQ(blob.cpu_data()[j], label); - } - } -} - 
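The crop-related DataTransformTest cases in this deleted file all hinge on one property the tests themselves document: when crop_size is set, the TEST phase always takes the deterministic center crop, while the TRAIN phase draws a random offset, so repeated TEST-phase transforms of the same Datum agree element-for-element and repeated TRAIN-phase transforms almost never do. Below is a minimal NumPy sketch of that check, written independently of Caffe; the helper names crop() and num_sequence_matches() and the (channels, height, width) array standing in for a Datum are illustrative assumptions, not Caffe API.

import numpy as np

def crop(image, crop_size, train, rng):
    # image is (channels, height, width), the layout a Caffe Datum uses
    _, h, w = image.shape
    if train:
        # TRAIN phase: random crop offset, as the deleted TestCropTrain expects
        h_off = rng.integers(0, h - crop_size + 1)
        w_off = rng.integers(0, w - crop_size + 1)
    else:
        # TEST phase: deterministic center crop, as TestCropTest expects
        h_off = (h - crop_size) // 2
        w_off = (w - crop_size) // 2
    return image[:, h_off:h_off + crop_size, w_off:w_off + crop_size]

def num_sequence_matches(image, crop_size, train, num_iter=10, seed=1701):
    # Analogue of the deleted NumSequenceMatches(): record num_iter transforms,
    # run num_iter more with the same RNG, and count equal output elements.
    rng = np.random.default_rng(seed)
    first = [crop(image, crop_size, train, rng) for _ in range(num_iter)]
    second = [crop(image, crop_size, train, rng) for _ in range(num_iter)]
    return sum(int(np.sum(a == b)) for a, b in zip(first, second))

image = np.arange(3 * 4 * 5).reshape(3, 4, 5)   # the unique_pixels case
out_size = 3 * 2 * 2                            # elements per cropped output
print(num_sequence_matches(image, 2, train=False) == out_size * 10)  # True: TEST crops identical
print(num_sequence_matches(image, 2, train=True) < out_size * 10)    # True with overwhelming probability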
-TYPED_TEST(DataTransformTest, TestCropTrain) { - TransformationParameter transform_param; - const bool unique_pixels = true; // pixels are consecutive ints [0,size] - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - const int crop_size = 2; - const int size = channels * crop_size * crop_size; - - transform_param.set_crop_size(crop_size); - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - int num_matches = this->NumSequenceMatches(transform_param, datum, TRAIN); - EXPECT_LT(num_matches, size * this->num_iter_); -} - -TYPED_TEST(DataTransformTest, TestCropTest) { - TransformationParameter transform_param; - const bool unique_pixels = true; // pixels are consecutive ints [0,size] - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - const int crop_size = 2; - const int size = channels * crop_size * crop_size; - - transform_param.set_crop_size(crop_size); - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - int num_matches = this->NumSequenceMatches(transform_param, datum, TEST); - EXPECT_EQ(num_matches, size * this->num_iter_); -} - -TYPED_TEST(DataTransformTest, TestMirrorTrain) { - TransformationParameter transform_param; - const bool unique_pixels = true; // pixels are consecutive ints [0,size] - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - const int size = channels * height * width; - - transform_param.set_mirror(true); - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - int num_matches = this->NumSequenceMatches(transform_param, datum, TRAIN); - EXPECT_LT(num_matches, size * this->num_iter_); -} - -TYPED_TEST(DataTransformTest, TestMirrorTest) { - TransformationParameter transform_param; - const bool unique_pixels = true; // pixels are consecutive ints [0,size] - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - const int size = channels * height * width; - - transform_param.set_mirror(true); - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - int num_matches = this->NumSequenceMatches(transform_param, datum, TEST); - EXPECT_LT(num_matches, size * this->num_iter_); -} - -TYPED_TEST(DataTransformTest, TestCropMirrorTrain) { - TransformationParameter transform_param; - const bool unique_pixels = true; // pixels are consecutive ints [0,size] - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - const int crop_size = 2; - - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - transform_param.set_crop_size(crop_size); - int num_matches_crop = this->NumSequenceMatches( - transform_param, datum, TRAIN); - - transform_param.set_mirror(true); - int num_matches_crop_mirror = - this->NumSequenceMatches(transform_param, datum, TRAIN); - // When doing crop and mirror we expect less num_matches than just crop - EXPECT_LE(num_matches_crop_mirror, num_matches_crop); -} - -TYPED_TEST(DataTransformTest, TestCropMirrorTest) { - TransformationParameter transform_param; - const bool unique_pixels = true; // pixels are consecutive ints [0,size] - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - const int crop_size = 2; - - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - transform_param.set_crop_size(crop_size); - int num_matches_crop = 
this->NumSequenceMatches(transform_param, datum, TEST); - - transform_param.set_mirror(true); - int num_matches_crop_mirror = - this->NumSequenceMatches(transform_param, datum, TEST); - // When doing crop and mirror we expect less num_matches than just crop - EXPECT_LT(num_matches_crop_mirror, num_matches_crop); -} - - -TYPED_TEST(DataTransformTest, TestMeanValue) { - TransformationParameter transform_param; - const bool unique_pixels = false; // pixels are equal to label - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - const int mean_value = 2; - - transform_param.add_mean_value(mean_value); - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - Blob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); - transformer.InitRand(); - transformer.Transform(datum, &blob); - for (int j = 0; j < blob.count(); ++j) { - EXPECT_EQ(blob.cpu_data()[j], label - mean_value); - } -} - -TYPED_TEST(DataTransformTest, TestMeanValues) { - TransformationParameter transform_param; - const bool unique_pixels = false; // pixels are equal to label - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - - transform_param.add_mean_value(0); - transform_param.add_mean_value(1); - transform_param.add_mean_value(2); - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - Blob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); - transformer.InitRand(); - transformer.Transform(datum, &blob); - for (int c = 0; c < channels; ++c) { - for (int j = 0; j < height * width; ++j) { - EXPECT_EQ(blob.cpu_data()[blob.offset(0, c) + j], label - c); - } - } -} - -TYPED_TEST(DataTransformTest, TestMeanFile) { - TransformationParameter transform_param; - const bool unique_pixels = true; // pixels are consecutive ints [0,size] - const int label = 0; - const int channels = 3; - const int height = 4; - const int width = 5; - const int size = channels * height * width; - - // Create a mean file - string mean_file; - MakeTempFilename(&mean_file); - BlobProto blob_mean; - blob_mean.set_num(1); - blob_mean.set_channels(channels); - blob_mean.set_height(height); - blob_mean.set_width(width); - - for (int j = 0; j < size; ++j) { - blob_mean.add_data(j); - } - - LOG(INFO) << "Using temporary mean_file " << mean_file; - WriteProtoToBinaryFile(blob_mean, mean_file); - - transform_param.set_mean_file(mean_file); - Datum datum; - FillDatum(label, channels, height, width, unique_pixels, &datum); - Blob blob(1, channels, height, width); - DataTransformer transformer(transform_param, TEST); - transformer.InitRand(); - transformer.Transform(datum, &blob); - for (int j = 0; j < blob.count(); ++j) { - EXPECT_EQ(blob.cpu_data()[j], 0); - } -} - -} // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/test/test_db.cpp b/src/caffe/test/test_db.cpp deleted file mode 100644 index 1b487b1..0000000 --- a/src/caffe/test/test_db.cpp +++ /dev/null @@ -1,136 +0,0 @@ -#if defined(USE_LEVELDB) && defined(USE_LMDB) && defined(USE_OPENCV) -#include - -#include "boost/scoped_ptr.hpp" -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/db.hpp" -#include "caffe/util/io.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -using boost::scoped_ptr; - -template -class DBTest : public ::testing::Test { - protected: - DBTest() - : backend_(TypeParam::backend), - 
root_images_(string(EXAMPLES_SOURCE_DIR) + string("images/")) {} - - virtual void SetUp() { - MakeTempDir(&source_); - source_ += "/db"; - string keys[] = {"cat.jpg", "fish-bike.jpg"}; - LOG(INFO) << "Using temporary db " << source_; - scoped_ptr db(db::GetDB(TypeParam::backend)); - db->Open(this->source_, db::NEW); - scoped_ptr txn(db->NewTransaction()); - for (int i = 0; i < 2; ++i) { - Datum datum; - ReadImageToDatum(root_images_ + keys[i], i, &datum); - string out; - CHECK(datum.SerializeToString(&out)); - txn->Put(keys[i], out); - } - txn->Commit(); - } - - virtual ~DBTest() { } - - DataParameter_DB backend_; - string source_; - string root_images_; -}; - -struct TypeLevelDB { - static DataParameter_DB backend; -}; -DataParameter_DB TypeLevelDB::backend = DataParameter_DB_LEVELDB; - -struct TypeLMDB { - static DataParameter_DB backend; -}; -DataParameter_DB TypeLMDB::backend = DataParameter_DB_LMDB; - -// typedef ::testing::Types TestTypes; -typedef ::testing::Types TestTypes; - -TYPED_TEST_CASE(DBTest, TestTypes); - -TYPED_TEST(DBTest, TestGetDB) { - scoped_ptr db(db::GetDB(TypeParam::backend)); -} - -TYPED_TEST(DBTest, TestNext) { - scoped_ptr db(db::GetDB(TypeParam::backend)); - db->Open(this->source_, db::READ); - scoped_ptr cursor(db->NewCursor()); - EXPECT_TRUE(cursor->valid()); - cursor->Next(); - EXPECT_TRUE(cursor->valid()); - cursor->Next(); - EXPECT_FALSE(cursor->valid()); -} - -TYPED_TEST(DBTest, TestSeekToFirst) { - scoped_ptr db(db::GetDB(TypeParam::backend)); - db->Open(this->source_, db::READ); - scoped_ptr cursor(db->NewCursor()); - cursor->Next(); - cursor->SeekToFirst(); - EXPECT_TRUE(cursor->valid()); - string key = cursor->key(); - Datum datum; - datum.ParseFromString(cursor->value()); - EXPECT_EQ(key, "cat.jpg"); - EXPECT_EQ(datum.channels(), 3); - EXPECT_EQ(datum.height(), 360); - EXPECT_EQ(datum.width(), 480); -} - -TYPED_TEST(DBTest, TestKeyValue) { - scoped_ptr db(db::GetDB(TypeParam::backend)); - db->Open(this->source_, db::READ); - scoped_ptr cursor(db->NewCursor()); - EXPECT_TRUE(cursor->valid()); - string key = cursor->key(); - Datum datum; - datum.ParseFromString(cursor->value()); - EXPECT_EQ(key, "cat.jpg"); - EXPECT_EQ(datum.channels(), 3); - EXPECT_EQ(datum.height(), 360); - EXPECT_EQ(datum.width(), 480); - cursor->Next(); - EXPECT_TRUE(cursor->valid()); - key = cursor->key(); - datum.ParseFromString(cursor->value()); - EXPECT_EQ(key, "fish-bike.jpg"); - EXPECT_EQ(datum.channels(), 3); - EXPECT_EQ(datum.height(), 323); - EXPECT_EQ(datum.width(), 481); - cursor->Next(); - EXPECT_FALSE(cursor->valid()); -} - -TYPED_TEST(DBTest, TestWrite) { - scoped_ptr db(db::GetDB(TypeParam::backend)); - db->Open(this->source_, db::WRITE); - scoped_ptr txn(db->NewTransaction()); - Datum datum; - ReadFileToDatum(this->root_images_ + "cat.jpg", 0, &datum); - string out; - CHECK(datum.SerializeToString(&out)); - txn->Put("cat.jpg", out); - ReadFileToDatum(this->root_images_ + "fish-bike.jpg", 1, &datum); - CHECK(datum.SerializeToString(&out)); - txn->Put("fish-bike.jpg", out); - txn->Commit(); -} - -} // namespace caffe -#endif // USE_LEVELDB, USE_LMDB and USE_OPENCV diff --git a/src/caffe/test/test_deconvolution_layer.cpp b/src/caffe/test/test_deconvolution_layer.cpp deleted file mode 100644 index c4b09ad..0000000 --- a/src/caffe/test/test_deconvolution_layer.cpp +++ /dev/null @@ -1,304 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/deconv_layer.hpp" - -#include 
"caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -// Since ConvolutionLayerTest checks the shared conv/deconv code in detail, -// we'll just do a simple forward test and a gradient check. -template -class DeconvolutionLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - DeconvolutionLayerTest() - : blob_bottom_(new Blob(2, 3, 6, 4)), - blob_bottom_2_(new Blob(2, 3, 6, 4)), - blob_top_(new Blob()), - blob_top_2_(new Blob()) {} - virtual void SetUp() { - // fill the values - FillerParameter filler_param; - filler_param.set_value(1.); - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - filler.Fill(this->blob_bottom_2_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - - virtual ~DeconvolutionLayerTest() { - delete blob_bottom_; - delete blob_bottom_2_; - delete blob_top_; - delete blob_top_2_; - } - - Blob* const blob_bottom_; - Blob* const blob_bottom_2_; - Blob* const blob_top_; - Blob* const blob_top_2_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(DeconvolutionLayerTest, TestDtypesAndDevices); - -TYPED_TEST(DeconvolutionLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(4); - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - shared_ptr > layer( - new DeconvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 4); - EXPECT_EQ(this->blob_top_->height(), 13); - EXPECT_EQ(this->blob_top_->width(), 9); - EXPECT_EQ(this->blob_top_2_->num(), 2); - EXPECT_EQ(this->blob_top_2_->channels(), 4); - EXPECT_EQ(this->blob_top_2_->height(), 13); - EXPECT_EQ(this->blob_top_2_->width(), 9); - // setting group should not change the shape - convolution_param->set_num_output(3); - convolution_param->set_group(3); - layer.reset(new DeconvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 3); - EXPECT_EQ(this->blob_top_->height(), 13); - EXPECT_EQ(this->blob_top_->width(), 9); - EXPECT_EQ(this->blob_top_2_->num(), 2); - EXPECT_EQ(this->blob_top_2_->channels(), 3); - EXPECT_EQ(this->blob_top_2_->height(), 13); - EXPECT_EQ(this->blob_top_2_->width(), 9); -} - -TYPED_TEST(DeconvolutionLayerTest, TestSimpleDeconvolution) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_num_output(4); - convolution_param->mutable_weight_filler()->set_type("constant"); - convolution_param->mutable_weight_filler()->set_value(1); - convolution_param->mutable_bias_filler()->set_type("constant"); - convolution_param->mutable_bias_filler()->set_value(0.1); - shared_ptr > layer( - new DeconvolutionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - // 
constant-fill the bottom blobs - FillerParameter filler_param; - filler_param.set_value(1.); - ConstantFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - filler.Fill(this->blob_bottom_2_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // simply check that accumulation works with overlapping filters - const Dtype* top_data = this->blob_top_->cpu_data(); - for (int n = 0; n < this->blob_top_->num(); ++n) { - for (int c = 0; c < this->blob_top_->channels(); ++c) { - for (int h = 0; h < this->blob_top_->height(); ++h) { - for (int w = 0; w < this->blob_top_->width(); ++w) { - Dtype expected = 3.1; - bool h_overlap = h % 2 == 0 && h > 0 - && h < this->blob_top_->height() - 1; - bool w_overlap = w % 2 == 0 && w > 0 - && w < this->blob_top_->width() - 1; - if (h_overlap && w_overlap) { - expected += 9; - } else if (h_overlap || w_overlap) { - expected += 3; - } - EXPECT_NEAR(top_data[this->blob_top_->offset(n, c, h, w)], - expected, 1e-4); - } - } - } - } -} - -TYPED_TEST(DeconvolutionLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - this->blob_bottom_vec_.push_back(this->blob_bottom_2_); - this->blob_top_vec_.push_back(this->blob_top_2_); - convolution_param->add_kernel_size(2); - convolution_param->add_stride(1); - convolution_param->set_num_output(1); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - DeconvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(DeconvolutionLayerTest, TestNDAgainst2D) { - typedef typename TypeParam::Dtype Dtype; - const int kernel_h = 11; - const int kernel_w = 13; - vector bottom_shape(4); - bottom_shape[0] = 15; - bottom_shape[1] = 12; - bottom_shape[2] = kernel_h * 2; - bottom_shape[3] = kernel_w * 2; - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->set_num_output(18); - convolution_param->set_bias_term(false); - convolution_param->set_group(6); - convolution_param->set_kernel_h(kernel_h); - convolution_param->set_kernel_w(kernel_w); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - Blob weights; - Blob top_diff; - // Shape and fill weights and top_diff. - bool copy_diff; - bool reshape; - { - DeconvolutionLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - top_diff.ReshapeLike(*this->blob_top_); - filler.Fill(&top_diff); - ASSERT_EQ(1, layer.blobs().size()); - copy_diff = false; reshape = true; - weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape); - } - vector propagate_down(1, true); - Blob result_2d; - Blob backward_result_2d; - Blob backward_weight_result_2d; - // Test with 2D im2col - { - caffe_set(this->blob_top_->count(), Dtype(0), - this->blob_top_->mutable_cpu_data()); - caffe_set(this->blob_bottom_->count(), Dtype(0), - this->blob_bottom_->mutable_cpu_diff()); - caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); - // Do SetUp and Forward; save Forward result in result_2d. 
- convolution_param->set_force_nd_im2col(false); - DeconvolutionLayer layer_2d(layer_param); - layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(1, layer_2d.blobs().size()); - copy_diff = false; reshape = false; - layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape); - layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - copy_diff = false; reshape = true; - result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape); - // Copy pre-generated top diff into actual top diff; - // do Backward and save result in backward_result_2d. - ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer_2d.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - copy_diff = true; reshape = true; - backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape); - backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape); - } - Blob result_nd; - Blob backward_result_nd; - Blob backward_weight_result_nd; - // Test with ND im2col - { - caffe_set(this->blob_top_->count(), Dtype(0), - this->blob_top_->mutable_cpu_data()); - caffe_set(this->blob_bottom_->count(), Dtype(0), - this->blob_bottom_->mutable_cpu_diff()); - caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); - // Do SetUp and Forward; save Forward result in result_nd. - convolution_param->set_force_nd_im2col(true); - DeconvolutionLayer layer_nd(layer_param); - layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(1, layer_nd.blobs().size()); - copy_diff = false; reshape = false; - layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape); - layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - copy_diff = false; reshape = true; - result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape); - // Copy pre-generated top diff into actual top diff; - // do Backward and save result in backward_result_nd. 
- ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer_nd.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - copy_diff = true; reshape = true; - backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape); - backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape); - } - ASSERT_EQ(result_nd.count(), result_2d.count()); - for (int i = 0; i < result_2d.count(); ++i) { - EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]); - } - ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count()); - for (int i = 0; i < backward_result_2d.count(); ++i) { - EXPECT_EQ(backward_result_2d.cpu_diff()[i], - backward_result_nd.cpu_diff()[i]); - } - ASSERT_EQ(backward_weight_result_nd.count(), - backward_weight_result_2d.count()); - for (int i = 0; i < backward_weight_result_2d.count(); ++i) { - EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i], - backward_weight_result_nd.cpu_diff()[i]); - } -} - -TYPED_TEST(DeconvolutionLayerTest, TestGradient3D) { - typedef typename TypeParam::Dtype Dtype; - vector bottom_shape(5); - bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); - bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); - bottom_shape[2] = 2; - bottom_shape[3] = 3; - bottom_shape[4] = 2; - FillerParameter filler_param; - GaussianFiller filler(filler_param); - for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { - this->blob_bottom_vec_[i]->Reshape(bottom_shape); - filler.Fill(this->blob_bottom_vec_[i]); - } - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(2); - convolution_param->add_stride(2); - convolution_param->add_pad(1); - convolution_param->set_num_output(2); - convolution_param->mutable_weight_filler()->set_type("gaussian"); - convolution_param->mutable_bias_filler()->set_type("gaussian"); - DeconvolutionLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_dummy_data_layer.cpp b/src/caffe/test/test_dummy_data_layer.cpp deleted file mode 100644 index 1a01ca8..0000000 --- a/src/caffe/test/test_dummy_data_layer.cpp +++ /dev/null @@ -1,193 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layers/dummy_data_layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class DummyDataLayerTest : public CPUDeviceTest { - protected: - DummyDataLayerTest() - : blob_top_a_(new Blob()), - blob_top_b_(new Blob()), - blob_top_c_(new Blob()) {} - - virtual void SetUp() { - blob_bottom_vec_.clear(); - blob_top_vec_.clear(); - blob_top_vec_.push_back(blob_top_a_); - blob_top_vec_.push_back(blob_top_b_); - blob_top_vec_.push_back(blob_top_c_); - } - - virtual ~DummyDataLayerTest() { - delete blob_top_a_; - delete blob_top_b_; - delete blob_top_c_; - } - - Blob* const blob_top_a_; - Blob* const blob_top_b_; - Blob* const blob_top_c_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(DummyDataLayerTest, TestDtypes); - -TYPED_TEST(DummyDataLayerTest, TestOneTopConstant) { - LayerParameter param; - DummyDataParameter* dummy_data_param = param.mutable_dummy_data_param(); - dummy_data_param->add_num(5); - 
dummy_data_param->add_channels(3); - dummy_data_param->add_height(2); - dummy_data_param->add_width(4); - this->blob_top_vec_.resize(1); - DummyDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_a_->num(), 5); - EXPECT_EQ(this->blob_top_a_->channels(), 3); - EXPECT_EQ(this->blob_top_a_->height(), 2); - EXPECT_EQ(this->blob_top_a_->width(), 4); - EXPECT_EQ(this->blob_top_b_->count(), 0); - EXPECT_EQ(this->blob_top_c_->count(), 0); - for (int i = 0; i < this->blob_top_vec_.size(); ++i) { - for (int j = 0; j < this->blob_top_vec_[i]->count(); ++j) { - EXPECT_EQ(0, this->blob_top_vec_[i]->cpu_data()[j]); - } - } - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_vec_.size(); ++i) { - for (int j = 0; j < this->blob_top_vec_[i]->count(); ++j) { - EXPECT_EQ(0, this->blob_top_vec_[i]->cpu_data()[j]); - } - } -} - -TYPED_TEST(DummyDataLayerTest, TestTwoTopConstant) { - LayerParameter param; - DummyDataParameter* dummy_data_param = param.mutable_dummy_data_param(); - dummy_data_param->add_num(5); - dummy_data_param->add_channels(3); - dummy_data_param->add_height(2); - dummy_data_param->add_width(4); - dummy_data_param->add_num(5); - // Don't explicitly set number of channels or height for 2nd top blob; should - // default to first channels and height (as we check later). - dummy_data_param->add_height(1); - FillerParameter* data_filler_param = dummy_data_param->add_data_filler(); - data_filler_param->set_value(7); - this->blob_top_vec_.resize(2); - DummyDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_a_->num(), 5); - EXPECT_EQ(this->blob_top_a_->channels(), 3); - EXPECT_EQ(this->blob_top_a_->height(), 2); - EXPECT_EQ(this->blob_top_a_->width(), 4); - EXPECT_EQ(this->blob_top_b_->num(), 5); - EXPECT_EQ(this->blob_top_b_->channels(), 3); - EXPECT_EQ(this->blob_top_b_->height(), 1); - EXPECT_EQ(this->blob_top_b_->width(), 4); - EXPECT_EQ(this->blob_top_c_->count(), 0); - for (int i = 0; i < this->blob_top_vec_.size(); ++i) { - for (int j = 0; j < this->blob_top_vec_[i]->count(); ++j) { - EXPECT_EQ(7, this->blob_top_vec_[i]->cpu_data()[j]); - } - } - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_vec_.size(); ++i) { - for (int j = 0; j < this->blob_top_vec_[i]->count(); ++j) { - EXPECT_EQ(7, this->blob_top_vec_[i]->cpu_data()[j]); - } - } -} - -TYPED_TEST(DummyDataLayerTest, TestThreeTopConstantGaussianConstant) { - LayerParameter param; - DummyDataParameter* dummy_data_param = param.mutable_dummy_data_param(); - dummy_data_param->add_num(5); - dummy_data_param->add_channels(3); - dummy_data_param->add_height(2); - dummy_data_param->add_width(4); - FillerParameter* data_filler_param_a = dummy_data_param->add_data_filler(); - data_filler_param_a->set_value(7); - FillerParameter* data_filler_param_b = dummy_data_param->add_data_filler(); - data_filler_param_b->set_type("gaussian"); - TypeParam gaussian_mean = 3.0; - TypeParam gaussian_std = 0.01; - data_filler_param_b->set_mean(gaussian_mean); - data_filler_param_b->set_std(gaussian_std); - FillerParameter* data_filler_param_c = dummy_data_param->add_data_filler(); - data_filler_param_c->set_value(9); - DummyDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_a_->num(), 5); - EXPECT_EQ(this->blob_top_a_->channels(), 3); - EXPECT_EQ(this->blob_top_a_->height(), 2); - 
EXPECT_EQ(this->blob_top_a_->width(), 4); - EXPECT_EQ(this->blob_top_b_->num(), 5); - EXPECT_EQ(this->blob_top_b_->channels(), 3); - EXPECT_EQ(this->blob_top_b_->height(), 2); - EXPECT_EQ(this->blob_top_b_->width(), 4); - EXPECT_EQ(this->blob_top_c_->num(), 5); - EXPECT_EQ(this->blob_top_c_->channels(), 3); - EXPECT_EQ(this->blob_top_c_->height(), 2); - EXPECT_EQ(this->blob_top_c_->width(), 4); - for (int i = 0; i < this->blob_top_a_->count(); ++i) { - EXPECT_EQ(7, this->blob_top_a_->cpu_data()[i]); - } - // Blob b uses a Gaussian filler, so SetUp should not have initialized it. - // Blob b's data should therefore be the default Blob data value: 0. - for (int i = 0; i < this->blob_top_b_->count(); ++i) { - EXPECT_EQ(0, this->blob_top_b_->cpu_data()[i]); - } - for (int i = 0; i < this->blob_top_c_->count(); ++i) { - EXPECT_EQ(9, this->blob_top_c_->cpu_data()[i]); - } - - // Do a Forward pass to fill in Blob b with Gaussian data. - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_a_->count(); ++i) { - EXPECT_EQ(7, this->blob_top_a_->cpu_data()[i]); - } - // Check that the Gaussian's data has been filled in with values within - // 10 standard deviations of the mean. Record the first and last sample. - // to check that they're different after the next Forward pass. - for (int i = 0; i < this->blob_top_b_->count(); ++i) { - EXPECT_NEAR(gaussian_mean, this->blob_top_b_->cpu_data()[i], - gaussian_std * 10); - } - const TypeParam first_gaussian_sample = this->blob_top_b_->cpu_data()[0]; - const TypeParam last_gaussian_sample = - this->blob_top_b_->cpu_data()[this->blob_top_b_->count() - 1]; - for (int i = 0; i < this->blob_top_c_->count(); ++i) { - EXPECT_EQ(9, this->blob_top_c_->cpu_data()[i]); - } - - // Do another Forward pass to fill in Blob b with Gaussian data again, - // checking that we get different values. 
- layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_a_->count(); ++i) { - EXPECT_EQ(7, this->blob_top_a_->cpu_data()[i]); - } - for (int i = 0; i < this->blob_top_b_->count(); ++i) { - EXPECT_NEAR(gaussian_mean, this->blob_top_b_->cpu_data()[i], - gaussian_std * 10); - } - EXPECT_NE(first_gaussian_sample, this->blob_top_b_->cpu_data()[0]); - EXPECT_NE(last_gaussian_sample, - this->blob_top_b_->cpu_data()[this->blob_top_b_->count() - 1]); - for (int i = 0; i < this->blob_top_c_->count(); ++i) { - EXPECT_EQ(9, this->blob_top_c_->cpu_data()[i]); - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_eltwise_layer.cpp b/src/caffe/test/test_eltwise_layer.cpp deleted file mode 100644 index c06e3ba..0000000 --- a/src/caffe/test/test_eltwise_layer.cpp +++ /dev/null @@ -1,209 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/eltwise_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class EltwiseLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - EltwiseLayerTest() - : blob_bottom_a_(new Blob(2, 3, 4, 5)), - blob_bottom_b_(new Blob(2, 3, 4, 5)), - blob_bottom_c_(new Blob(2, 3, 4, 5)), - blob_top_(new Blob()) { - // fill the values - Caffe::set_random_seed(1701); - FillerParameter filler_param; - UniformFiller filler(filler_param); - filler.Fill(this->blob_bottom_a_); - filler.Fill(this->blob_bottom_b_); - filler.Fill(this->blob_bottom_c_); - blob_bottom_vec_.push_back(blob_bottom_a_); - blob_bottom_vec_.push_back(blob_bottom_b_); - blob_bottom_vec_.push_back(blob_bottom_c_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~EltwiseLayerTest() { - delete blob_bottom_a_; - delete blob_bottom_b_; - delete blob_bottom_c_; - delete blob_top_; - } - Blob* const blob_bottom_a_; - Blob* const blob_bottom_b_; - Blob* const blob_bottom_c_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(EltwiseLayerTest, TestDtypesAndDevices); - -TYPED_TEST(EltwiseLayerTest, TestSetUp) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD); - shared_ptr > layer( - new EltwiseLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 3); - EXPECT_EQ(this->blob_top_->height(), 4); - EXPECT_EQ(this->blob_top_->width(), 5); -} - -TYPED_TEST(EltwiseLayerTest, TestProd) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD); - shared_ptr > layer( - new EltwiseLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data_a = this->blob_bottom_a_->cpu_data(); - const Dtype* in_data_b = this->blob_bottom_b_->cpu_data(); - const Dtype* in_data_c = this->blob_bottom_c_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] * in_data_b[i] 
* in_data_c[i], 1e-4); - } -} - -TYPED_TEST(EltwiseLayerTest, TestSum) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM); - shared_ptr > layer( - new EltwiseLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data_a = this->blob_bottom_a_->cpu_data(); - const Dtype* in_data_b = this->blob_bottom_b_->cpu_data(); - const Dtype* in_data_c = this->blob_bottom_c_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i] + in_data_c[i], 1e-4); - } -} - -TYPED_TEST(EltwiseLayerTest, TestSumCoeff) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM); - eltwise_param->add_coeff(1); - eltwise_param->add_coeff(-0.5); - eltwise_param->add_coeff(2); - shared_ptr > layer( - new EltwiseLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data_a = this->blob_bottom_a_->cpu_data(); - const Dtype* in_data_b = this->blob_bottom_b_->cpu_data(); - const Dtype* in_data_c = this->blob_bottom_c_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] - 0.5*in_data_b[i] + 2*in_data_c[i], - 1e-4); - } -} - -TYPED_TEST(EltwiseLayerTest, TestStableProdGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD); - eltwise_param->set_stable_prod_grad(true); - EltwiseLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(EltwiseLayerTest, TestUnstableProdGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD); - eltwise_param->set_stable_prod_grad(false); - EltwiseLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(EltwiseLayerTest, TestSumGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM); - EltwiseLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(EltwiseLayerTest, TestSumCoeffGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM); - eltwise_param->add_coeff(1); - eltwise_param->add_coeff(-0.5); - eltwise_param->add_coeff(2); - 
EltwiseLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(EltwiseLayerTest, TestMax) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_MAX); - shared_ptr > layer( - new EltwiseLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data_a = this->blob_bottom_a_->cpu_data(); - const Dtype* in_data_b = this->blob_bottom_b_->cpu_data(); - const Dtype* in_data_c = this->blob_bottom_c_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_EQ(data[i], - std::max(in_data_a[i], std::max(in_data_b[i], in_data_c[i]))); - } -} - -TYPED_TEST(EltwiseLayerTest, TestMaxGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); - eltwise_param->set_operation(EltwiseParameter_EltwiseOp_MAX); - EltwiseLayer layer(layer_param); - GradientChecker checker(1e-4, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_embed_layer.cpp b/src/caffe/test/test_embed_layer.cpp deleted file mode 100644 index dc7f5c4..0000000 --- a/src/caffe/test/test_embed_layer.cpp +++ /dev/null @@ -1,178 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/embed_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class EmbedLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - protected: - EmbedLayerTest() - : blob_bottom_(new Blob(4, 1, 1, 1)), - blob_top_(new Blob()) { - // fill the values - FillerParameter filler_param; - UniformFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~EmbedLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(EmbedLayerTest, TestDtypesAndDevices); - -TYPED_TEST(EmbedLayerTest, TestSetUp) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EmbedParameter* embed_param = layer_param.mutable_embed_param(); - embed_param->set_num_output(10); - embed_param->set_input_dim(5); - shared_ptr > layer(new EmbedLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 5); - EXPECT_EQ(this->blob_top_->shape(0), 4); - EXPECT_EQ(this->blob_top_->shape(1), 1); - EXPECT_EQ(this->blob_top_->shape(2), 1); - EXPECT_EQ(this->blob_top_->shape(3), 1); - EXPECT_EQ(this->blob_top_->shape(4), 10); -} - -TYPED_TEST(EmbedLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EmbedParameter* embed_param = layer_param.mutable_embed_param(); - const int kNumOutput = 10; - const int kInputDim = 5; - embed_param->set_num_output(kNumOutput); - embed_param->set_input_dim(kInputDim); - 
embed_param->mutable_weight_filler()->set_type("uniform"); - embed_param->mutable_weight_filler()->set_min(-10); - embed_param->mutable_weight_filler()->set_max(10); - embed_param->set_bias_term(false); - shared_ptr > layer(new EmbedLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(1, layer->blobs().size()); - vector weight_shape(2); - weight_shape[0] = kInputDim; - weight_shape[1] = kNumOutput; - ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape()); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim; - } - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - vector weight_offset(2, 0); - vector top_offset(5, 0); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - weight_offset[0] = static_cast(this->blob_bottom_->cpu_data()[i]); - weight_offset[1] = 0; - top_offset[0] = i; - top_offset[4] = 0; - for (int j = 0; j < kNumOutput; ++j) { - EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset), - this->blob_top_->data_at(top_offset)); - ++top_offset[4]; - ++weight_offset[1]; - } - } -} - -TYPED_TEST(EmbedLayerTest, TestForwardWithBias) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EmbedParameter* embed_param = layer_param.mutable_embed_param(); - const int kNumOutput = 10; - const int kInputDim = 5; - embed_param->set_num_output(kNumOutput); - embed_param->set_input_dim(kInputDim); - embed_param->mutable_weight_filler()->set_type("uniform"); - embed_param->mutable_weight_filler()->set_min(-10); - embed_param->mutable_weight_filler()->set_max(10); - embed_param->mutable_bias_filler()->CopyFrom(embed_param->weight_filler()); - embed_param->set_bias_term(true); - shared_ptr > layer(new EmbedLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(2, layer->blobs().size()); - vector weight_shape(2); - weight_shape[0] = kInputDim; - weight_shape[1] = kNumOutput; - ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape()); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim; - } - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - vector bias_offset(1, 0); - vector weight_offset(2, 0); - vector top_offset(5, 0); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - weight_offset[0] = static_cast(this->blob_bottom_->cpu_data()[i]); - weight_offset[1] = 0; - top_offset[0] = i; - top_offset[4] = 0; - bias_offset[0] = 0; - for (int j = 0; j < kNumOutput; ++j) { - EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset) + - layer->blobs()[1]->data_at(bias_offset), - this->blob_top_->data_at(top_offset)); - ++top_offset[4]; - ++weight_offset[1]; - ++bias_offset[0]; - } - } -} - -TYPED_TEST(EmbedLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EmbedParameter* embed_param = layer_param.mutable_embed_param(); - embed_param->set_num_output(10); - embed_param->set_input_dim(5); - embed_param->set_bias_term(false); - embed_param->mutable_weight_filler()->set_type("uniform"); - embed_param->mutable_weight_filler()->set_min(-10); - embed_param->mutable_weight_filler()->set_max(10); - EmbedLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - this->blob_bottom_->mutable_cpu_data()[0] = 4; - this->blob_bottom_->mutable_cpu_data()[1] = 2; - this->blob_bottom_->mutable_cpu_data()[2] = 2; - this->blob_bottom_->mutable_cpu_data()[3] = 3; - 
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, -2); -} - -TYPED_TEST(EmbedLayerTest, TestGradientWithBias) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - EmbedParameter* embed_param = layer_param.mutable_embed_param(); - embed_param->set_num_output(10); - embed_param->set_input_dim(5); - embed_param->set_bias_term(true); - embed_param->mutable_weight_filler()->set_type("uniform"); - embed_param->mutable_weight_filler()->set_min(-10); - embed_param->mutable_weight_filler()->set_max(10); - embed_param->mutable_bias_filler()->CopyFrom(embed_param->weight_filler()); - EmbedLayer<Dtype> layer(layer_param); - GradientChecker<Dtype> checker(1e-2, 1e-3); - this->blob_bottom_->mutable_cpu_data()[0] = 4; - this->blob_bottom_->mutable_cpu_data()[1] = 2; - this->blob_bottom_->mutable_cpu_data()[2] = 2; - this->blob_bottom_->mutable_cpu_data()[3] = 3; - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, -2); -} - -} // namespace caffe diff --git a/src/caffe/test/test_euclidean_loss_layer.cpp b/src/caffe/test/test_euclidean_loss_layer.cpp deleted file mode 100644 index f253f9f..0000000 --- a/src/caffe/test/test_euclidean_loss_layer.cpp +++ /dev/null @@ -1,89 +0,0 @@ -#include <cmath> -#include <vector> - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/euclidean_loss_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template <typename TypeParam> -class EuclideanLossLayerTest : public MultiDeviceTest<TypeParam> { - typedef typename TypeParam::Dtype Dtype; - - protected: - EuclideanLossLayerTest() - : blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)), - blob_bottom_label_(new Blob<Dtype>(10, 5, 1, 1)), - blob_top_loss_(new Blob<Dtype>()) { - // fill the values - FillerParameter filler_param; - GaussianFiller<Dtype> filler(filler_param); - filler.Fill(this->blob_bottom_data_); - blob_bottom_vec_.push_back(blob_bottom_data_); - filler.Fill(this->blob_bottom_label_); - blob_bottom_vec_.push_back(blob_bottom_label_); - blob_top_vec_.push_back(blob_top_loss_); - } - virtual ~EuclideanLossLayerTest() { - delete blob_bottom_data_; - delete blob_bottom_label_; - delete blob_top_loss_; - } - - void TestForward() { - // Get the loss without a specified objective weight -- should be - // equivalent to explicitly specifying a weight of 1. - LayerParameter layer_param; - EuclideanLossLayer<Dtype> layer_weight_1(layer_param); - layer_weight_1.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype loss_weight_1 = - layer_weight_1.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - // Get the loss again with a different objective weight; check that it is - // scaled appropriately. - const Dtype kLossWeight = 3.7; - layer_param.add_loss_weight(kLossWeight); - EuclideanLossLayer<Dtype> layer_weight_2(layer_param); - layer_weight_2.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype loss_weight_2 = - layer_weight_2.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype kErrorMargin = 1e-5; - EXPECT_NEAR(loss_weight_1 * kLossWeight, loss_weight_2, kErrorMargin); - // Make sure the loss is non-trivial. 
- const Dtype kNonTrivialAbsThresh = 1e-1; - EXPECT_GE(fabs(loss_weight_1), kNonTrivialAbsThresh); - } - - Blob<Dtype>* const blob_bottom_data_; - Blob<Dtype>* const blob_bottom_label_; - Blob<Dtype>* const blob_top_loss_; - vector<Blob<Dtype>*> blob_bottom_vec_; - vector<Blob<Dtype>*> blob_top_vec_; -}; - -TYPED_TEST_CASE(EuclideanLossLayerTest, TestDtypesAndDevices); - -TYPED_TEST(EuclideanLossLayerTest, TestForward) { - this->TestForward(); -} - -TYPED_TEST(EuclideanLossLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - const Dtype kLossWeight = 3.7; - layer_param.add_loss_weight(kLossWeight); - EuclideanLossLayer<Dtype> layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - GradientChecker<Dtype> checker(1e-2, 1e-2, 1701); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_filler.cpp b/src/caffe/test/test_filler.cpp deleted file mode 100644 index 26e9b21..0000000 --- a/src/caffe/test/test_filler.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include "gtest/gtest.h" - -#include "caffe/filler.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template <typename Dtype> -class ConstantFillerTest : public ::testing::Test { - protected: - ConstantFillerTest() - : blob_(new Blob<Dtype>(2, 3, 4, 5)), - filler_param_() { - filler_param_.set_value(10.); - filler_.reset(new ConstantFiller<Dtype>(filler_param_)); - filler_->Fill(blob_); - } - virtual ~ConstantFillerTest() { delete blob_; } - Blob<Dtype>* const blob_; - FillerParameter filler_param_; - shared_ptr<ConstantFiller<Dtype> > filler_; -}; - -TYPED_TEST_CASE(ConstantFillerTest, TestDtypes); - -TYPED_TEST(ConstantFillerTest, TestFill) { - EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const TypeParam* data = this->blob_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_GE(data[i], this->filler_param_.value()); - } -} - - -template <typename Dtype> -class UniformFillerTest : public ::testing::Test { - protected: - UniformFillerTest() - : blob_(new Blob<Dtype>(2, 3, 4, 5)), - filler_param_() { - filler_param_.set_min(1.); - filler_param_.set_max(2.); - filler_.reset(new UniformFiller<Dtype>(filler_param_)); - filler_->Fill(blob_); - } - virtual ~UniformFillerTest() { delete blob_; } - Blob<Dtype>* const blob_; - FillerParameter filler_param_; - shared_ptr<UniformFiller<Dtype> > filler_; -}; - -TYPED_TEST_CASE(UniformFillerTest, TestDtypes); - -TYPED_TEST(UniformFillerTest, TestFill) { - EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const TypeParam* data = this->blob_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_GE(data[i], this->filler_param_.min()); - EXPECT_LE(data[i], this->filler_param_.max()); - } -} - -template <typename Dtype> -class PositiveUnitballFillerTest : public ::testing::Test { - protected: - PositiveUnitballFillerTest() - : blob_(new Blob<Dtype>(2, 3, 4, 5)), - filler_param_() { - filler_.reset(new PositiveUnitballFiller<Dtype>(filler_param_)); - filler_->Fill(blob_); - } - virtual ~PositiveUnitballFillerTest() { delete blob_; } - Blob<Dtype>* const blob_; - FillerParameter filler_param_; - shared_ptr<PositiveUnitballFiller<Dtype> > filler_; -}; - -TYPED_TEST_CASE(PositiveUnitballFillerTest, TestDtypes); - -TYPED_TEST(PositiveUnitballFillerTest, TestFill) { - EXPECT_TRUE(this->blob_); - const int num = this->blob_->num(); - const int count = this->blob_->count(); - const int dim = count / num; - const TypeParam* data = this->blob_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_GE(data[i], 0); - EXPECT_LE(data[i], 1); - } - for (int i = 0; i < num; ++i) { - TypeParam sum = 0; - for (int j = 0; j < dim; 
++j) { - sum += data[i * dim + j]; - } - EXPECT_GE(sum, 0.999); - EXPECT_LE(sum, 1.001); - } -} - -template <typename Dtype> -class GaussianFillerTest : public ::testing::Test { - protected: - GaussianFillerTest() - : blob_(new Blob<Dtype>(2, 3, 4, 5)), - filler_param_() { - filler_param_.set_mean(10.); - filler_param_.set_std(0.1); - filler_.reset(new GaussianFiller<Dtype>(filler_param_)); - filler_->Fill(blob_); - } - virtual ~GaussianFillerTest() { delete blob_; } - Blob<Dtype>* const blob_; - FillerParameter filler_param_; - shared_ptr<GaussianFiller<Dtype> > filler_; -}; - -TYPED_TEST_CASE(GaussianFillerTest, TestDtypes); - -TYPED_TEST(GaussianFillerTest, TestFill) { - EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const TypeParam* data = this->blob_->cpu_data(); - TypeParam mean = 0.; - TypeParam var = 0.; - for (int i = 0; i < count; ++i) { - mean += data[i]; - var += (data[i] - this->filler_param_.mean()) * - (data[i] - this->filler_param_.mean()); - } - mean /= count; - var /= count; - // Very loose test. - EXPECT_GE(mean, this->filler_param_.mean() - this->filler_param_.std() * 5); - EXPECT_LE(mean, this->filler_param_.mean() + this->filler_param_.std() * 5); - TypeParam target_var = this->filler_param_.std() * this->filler_param_.std(); - EXPECT_GE(var, target_var / 5.); - EXPECT_LE(var, target_var * 5.); -} - -template <typename Dtype> -class XavierFillerTest : public ::testing::Test { - protected: - XavierFillerTest() - : blob_(new Blob<Dtype>(1000, 2, 4, 5)), - filler_param_() { - } - virtual void test_params(FillerParameter_VarianceNorm variance_norm, - Dtype n) { - this->filler_param_.set_variance_norm(variance_norm); - this->filler_.reset(new XavierFiller<Dtype>(this->filler_param_)); - this->filler_->Fill(blob_); - EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const Dtype* data = this->blob_->cpu_data(); - Dtype mean = 0.; - Dtype ex2 = 0.; - for (int i = 0; i < count; ++i) { - mean += data[i]; - ex2 += data[i] * data[i]; - } - mean /= count; - ex2 /= count; - Dtype std = sqrt(ex2 - mean*mean); - Dtype target_std = sqrt(2.0 / n); - EXPECT_NEAR(mean, 0.0, 0.1); - EXPECT_NEAR(std, target_std, 0.1); - } - virtual ~XavierFillerTest() { delete blob_; } - Blob<Dtype>* const blob_; - FillerParameter filler_param_; - shared_ptr<XavierFiller<Dtype> > filler_; -}; - -TYPED_TEST_CASE(XavierFillerTest, TestDtypes); - -TYPED_TEST(XavierFillerTest, TestFillFanIn) { - TypeParam n = 2*4*5; - this->test_params(FillerParameter_VarianceNorm_FAN_IN, n); -} -TYPED_TEST(XavierFillerTest, TestFillFanOut) { - TypeParam n = 1000*4*5; - this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n); -} -TYPED_TEST(XavierFillerTest, TestFillAverage) { - TypeParam n = (2*4*5 + 1000*4*5) / 2.0; - this->test_params(FillerParameter_VarianceNorm_AVERAGE, n); -} - -template <typename Dtype> -class MSRAFillerTest : public ::testing::Test { - protected: - MSRAFillerTest() - : blob_(new Blob<Dtype>(1000, 2, 4, 5)), - filler_param_() { - } - virtual void test_params(FillerParameter_VarianceNorm variance_norm, - Dtype n) { - this->filler_param_.set_variance_norm(variance_norm); - this->filler_.reset(new MSRAFiller<Dtype>(this->filler_param_)); - this->filler_->Fill(blob_); - EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const Dtype* data = this->blob_->cpu_data(); - Dtype mean = 0.; - Dtype ex2 = 0.; - for (int i = 0; i < count; ++i) { - mean += data[i]; - ex2 += data[i] * data[i]; - } - mean /= count; - ex2 /= count; - Dtype std = sqrt(ex2 - mean*mean); - Dtype target_std = sqrt(2.0 / n); - EXPECT_NEAR(mean, 0.0, 0.1); - EXPECT_NEAR(std, target_std, 0.1); - } - virtual ~MSRAFillerTest() { 
delete blob_; } - Blob* const blob_; - FillerParameter filler_param_; - shared_ptr > filler_; -}; - -TYPED_TEST_CASE(MSRAFillerTest, TestDtypes); - -TYPED_TEST(MSRAFillerTest, TestFillFanIn) { - TypeParam n = 2*4*5; - this->test_params(FillerParameter_VarianceNorm_FAN_IN, n); -} -TYPED_TEST(MSRAFillerTest, TestFillFanOut) { - TypeParam n = 1000*4*5; - this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n); -} -TYPED_TEST(MSRAFillerTest, TestFillAverage) { - TypeParam n = (2*4*5 + 1000*4*5) / 2.0; - this->test_params(FillerParameter_VarianceNorm_AVERAGE, n); -} - -} // namespace caffe diff --git a/src/caffe/test/test_filter_layer.cpp b/src/caffe/test/test_filter_layer.cpp deleted file mode 100644 index 9ea2b8b..0000000 --- a/src/caffe/test/test_filter_layer.cpp +++ /dev/null @@ -1,126 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/filter_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class FilterLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - FilterLayerTest() - : blob_bottom_data_(new Blob(4, 3, 6, 4)), - blob_bottom_labels_(new Blob(4, 1, 1, 1)), - blob_bottom_selector_(new Blob(4, 1, 1, 1)), - blob_top_data_(new Blob()), - blob_top_labels_(new Blob()) {} - virtual void SetUp() { - // fill the values - Caffe::set_random_seed(1890); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - // fill the selector blob - Dtype* bottom_data_selector_ = blob_bottom_selector_->mutable_cpu_data(); - bottom_data_selector_[0] = 0; - bottom_data_selector_[1] = 1; - bottom_data_selector_[2] = 1; - bottom_data_selector_[3] = 0; - // fill the other bottom blobs - filler.Fill(blob_bottom_data_); - for (int i = 0; i < blob_bottom_labels_->count(); ++i) { - blob_bottom_labels_->mutable_cpu_data()[i] = caffe_rng_rand() % 5; - } - blob_bottom_vec_.push_back(blob_bottom_data_); - blob_bottom_vec_.push_back(blob_bottom_labels_); - blob_bottom_vec_.push_back(blob_bottom_selector_); - blob_top_vec_.push_back(blob_top_data_); - blob_top_vec_.push_back(blob_top_labels_); - } - virtual ~FilterLayerTest() { - delete blob_bottom_data_; - delete blob_bottom_labels_; - delete blob_bottom_selector_; - delete blob_top_data_; - delete blob_top_labels_; - } - Blob* const blob_bottom_data_; - Blob* const blob_bottom_labels_; - Blob* const blob_bottom_selector_; - // blobs for the top of FilterLayer - Blob* const blob_top_data_; - Blob* const blob_top_labels_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(FilterLayerTest, TestDtypesAndDevices); - -TYPED_TEST(FilterLayerTest, TestReshape) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - FilterLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Reshape(this->blob_bottom_vec_, this->blob_top_vec_); - // In the test first and last items should have been filtered - // so we just expect 2 remaining items - EXPECT_EQ(this->blob_top_data_->shape(0), 2); - EXPECT_EQ(this->blob_top_labels_->shape(0), 2); - EXPECT_GT(this->blob_bottom_data_->shape(0), - this->blob_top_data_->shape(0)); - EXPECT_GT(this->blob_bottom_labels_->shape(0), - this->blob_top_labels_->shape(0)); - for (int i = 1; i < this->blob_bottom_labels_->num_axes(); i++) { - EXPECT_EQ(this->blob_bottom_labels_->shape(i), - 
this->blob_top_labels_->shape(i)); - } -} - -TYPED_TEST(FilterLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - FilterLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Reshape(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_labels_->data_at(0, 0, 0, 0), - this->blob_bottom_labels_->data_at(1, 0, 0, 0)); - EXPECT_EQ(this->blob_top_labels_->data_at(1, 0, 0, 0), - this->blob_bottom_labels_->data_at(2, 0, 0, 0)); - - int dim = this->blob_top_data_->count() / - this->blob_top_data_->shape(0); - const Dtype* top_data = this->blob_top_data_->cpu_data(); - const Dtype* bottom_data = this->blob_bottom_data_->cpu_data(); - // selector is 0 1 1 0, so we need to compare bottom(1,c,h,w) - // with top(0,c,h,w) and bottom(2,c,h,w) with top(1,c,h,w) - bottom_data += dim; // bottom(1,c,h,w) - for (size_t n = 0; n < dim; n++) - EXPECT_EQ(top_data[n], bottom_data[n]); - - bottom_data += dim; // bottom(2,c,h,w) - top_data += dim; // top(1,c,h,w) - for (size_t n = 0; n < dim; n++) - EXPECT_EQ(top_data[n], bottom_data[n]); -} - -TYPED_TEST(FilterLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - FilterLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - // check only input 0 (data) because labels and selector - // don't need backpropagation - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -} // namespace caffe diff --git a/src/caffe/test/test_flatten_layer.cpp b/src/caffe/test/test_flatten_layer.cpp deleted file mode 100644 index d929ac7..0000000 --- a/src/caffe/test/test_flatten_layer.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/flatten_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class FlattenLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - protected: - FlattenLayerTest() - : blob_bottom_(new Blob(2, 3, 6, 5)), - blob_top_(new Blob()) { - Caffe::set_random_seed(1701); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~FlattenLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(FlattenLayerTest, TestDtypesAndDevices); - -TYPED_TEST(FlattenLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - FlattenLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 2); - EXPECT_EQ(this->blob_top_->shape(0), 2); - EXPECT_EQ(this->blob_top_->shape(1), 3 * 6 * 5); -} - -TYPED_TEST(FlattenLayerTest, TestSetupWithAxis) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_flatten_param()->set_axis(2); - FlattenLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 3); - EXPECT_EQ(this->blob_top_->shape(0), 2); - 
EXPECT_EQ(this->blob_top_->shape(1), 3); - EXPECT_EQ(this->blob_top_->shape(2), 6 * 5); -} - -TYPED_TEST(FlattenLayerTest, TestSetupWithEndAxis) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_flatten_param()->set_end_axis(-2); - FlattenLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 3); - EXPECT_EQ(this->blob_top_->shape(0), 2); - EXPECT_EQ(this->blob_top_->shape(1), 3 * 6); - EXPECT_EQ(this->blob_top_->shape(2), 5); -} - -TYPED_TEST(FlattenLayerTest, TestSetupWithStartAndEndAxis) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_flatten_param()->set_axis(0); - layer_param.mutable_flatten_param()->set_end_axis(-2); - FlattenLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 2); - EXPECT_EQ(this->blob_top_->shape(0), 2 * 3 * 6); - EXPECT_EQ(this->blob_top_->shape(1), 5); -} - -TYPED_TEST(FlattenLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - FlattenLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int c = 0; c < 3 * 6 * 5; ++c) { - EXPECT_EQ(this->blob_top_->data_at(0, c, 0, 0), - this->blob_bottom_->data_at(0, c / (6 * 5), (c / 5) % 6, c % 5)); - EXPECT_EQ(this->blob_top_->data_at(1, c, 0, 0), - this->blob_bottom_->data_at(1, c / (6 * 5), (c / 5) % 6, c % 5)); - } -} - -TYPED_TEST(FlattenLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - FlattenLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp deleted file mode 100644 index 975a8f0..0000000 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ /dev/null @@ -1,1265 +0,0 @@ -#include -#include -#include -#include - -#include "google/protobuf/text_format.h" - -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/parallel.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/sgd_solvers.hpp" -#include "caffe/util/io.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -using std::ostringstream; - -namespace caffe { - -template -class GradientBasedSolverTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - GradientBasedSolverTest() : - seed_(1701), num_(4), channels_(3), height_(10), width_(10), - share_(false) { - input_file_ = new string( - CMAKE_SOURCE_DIR "caffe/test/test_data/solver_data_list.txt" CMAKE_EXT); - } - ~GradientBasedSolverTest() { - delete input_file_; - } - - string snapshot_prefix_; - shared_ptr > solver_; - shared_ptr > sync_; - int seed_; - // Dimensions are determined by generate_sample_data.py - // TODO this is brittle and the hdf5 file should be checked instead. - int num_, channels_, height_, width_; - bool share_; - Dtype delta_; // Stability constant for RMSProp, AdaGrad, AdaDelta and Adam - - // Test data: check out generate_sample_data.py in the same directory. 
- string* input_file_; - - virtual void InitSolver(const SolverParameter& param) = 0; - - virtual void InitSolverFromProtoString(const string& proto) { - SolverParameter param; - CHECK(google::protobuf::TextFormat::ParseFromString(proto, ¶m)); - // Set the solver_mode according to current Caffe::mode. - switch (Caffe::mode()) { - case Caffe::CPU: - param.set_solver_mode(SolverParameter_SolverMode_CPU); - break; - case Caffe::GPU: - param.set_solver_mode(SolverParameter_SolverMode_GPU); - break; - default: - LOG(FATAL) << "Unknown Caffe mode: " << Caffe::mode(); - } - InitSolver(param); - delta_ = param.delta(); - } - - string RunLeastSquaresSolver(const Dtype learning_rate, - const Dtype weight_decay, const Dtype momentum, const int num_iters, - const int iter_size = 1, const int devices = 1, - const bool snapshot = false, const char* from_snapshot = NULL) { - ostringstream proto; - int device_id = 0; -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - CUDA_CHECK(cudaGetDevice(&device_id)); - } -#endif - proto << - "snapshot_after_train: " << snapshot << " " - "max_iter: " << num_iters << " " - "base_lr: " << learning_rate << " " - "lr_policy: 'fixed' " - "iter_size: " << iter_size << " " - "device_id: " << device_id << " " - "net_param { " - " name: 'TestNetwork' " - " layer { " - " name: 'data' " - " type: 'HDF5Data' " - " hdf5_data_param { " - " source: '" << *(this->input_file_) << "' " - " batch_size: " << num_ / iter_size << " " - " } " - " top: 'data' " - " top: 'targets' " - " } "; - if (share_) { - proto << - " layer { " - " name: 'slice' " - " type: 'Slice' " - " bottom: 'data' " - " top: 'data1' " - " top: 'data2' " - " slice_param { " - " axis: 0 " - " } " - " } "; - } - proto << - " layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " param { name: 'weights' } " - " param { name: 'bias' } " - " inner_product_param { " - " num_output: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 1.0 " - " } " - " bias_filler { " - " type: 'gaussian' " - " std: 1.0 " - " } " - " } " - " bottom: '" << string(share_ ? "data1": "data") << "' " - " top: '" << string(share_ ? 
"innerprod1": "innerprod") << "' " - " } "; - if (share_) { - proto << - " layer { " - " name: 'innerprod2' " - " type: 'InnerProduct' " - " param { name: 'weights' } " - " param { name: 'bias' } " - " inner_product_param { " - " num_output: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 1.0 " - " } " - " bias_filler { " - " type: 'gaussian' " - " std: 1.0 " - " } " - " } " - " bottom: 'data2' " - " top: 'innerprod2' " - " } " - " layer { " - " name: 'concat' " - " type: 'Concat' " - " bottom: 'innerprod1' " - " bottom: 'innerprod2' " - " top: 'innerprod' " - " concat_param { " - " axis: 0 " - " } " - " } "; - } - proto << - " layer { " - " name: 'loss' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod' " - " bottom: 'targets' " - " } " - "} "; - if (weight_decay != 0) { - proto << "weight_decay: " << weight_decay << " "; - } - if (momentum != 0) { - proto << "momentum: " << momentum << " "; - } - MakeTempDir(&snapshot_prefix_); - proto << "snapshot_prefix: '" << snapshot_prefix_ << "/' "; - if (snapshot) { - proto << "snapshot: " << num_iters << " "; - } - Caffe::set_random_seed(this->seed_); - this->InitSolverFromProtoString(proto.str()); - if (from_snapshot != NULL) { - this->solver_->Restore(from_snapshot); - for (int i = 0; i < this->solver_->iter(); ++i) { - this->solver_->net()->Forward(); - } - } - if (devices == 1) { - this->solver_->Solve(); - } else { - LOG(INFO) << "Multi-GPU test on " << devices << " devices"; - vector gpus; - // put current device at the beginning - int device_id = solver_->param().device_id(); - gpus.push_back(device_id); - for (int i = 0; gpus.size() < devices; ++i) { - if (i != device_id) - gpus.push_back(i); - } - Caffe::set_solver_count(gpus.size()); - this->sync_.reset(new P2PSync( - this->solver_, NULL, this->solver_->param())); - this->sync_->Run(gpus); - Caffe::set_solver_count(1); - } - if (snapshot) { - ostringstream resume_file; - resume_file << snapshot_prefix_ << "/_iter_" << num_iters - << ".solverstate"; - string resume_filename = resume_file.str(); - return resume_filename; - } - return string(); - } - - // Compute an update value given the current state of the train net, - // using the analytical formula for the least squares gradient. - // updated_params will store the updated weight and bias results, - // using the blobs' diffs to hold the update values themselves. - void ComputeLeastSquaresUpdate(const Dtype learning_rate, - const Dtype weight_decay, const Dtype momentum, const int num_iters, - vector > >* updated_params) { - const int N = num_; - const int D = channels_ * height_ * width_; - - // Run a forward pass, and manually compute the update values from the - // result. 
- Net& net = *this->solver_->net(); - net.Forward(); - ASSERT_TRUE(net.has_blob("data")); - const Blob& data = *net.blob_by_name("data"); - ASSERT_TRUE(net.has_blob("targets")); - const Blob& targets = *net.blob_by_name("targets"); - ASSERT_TRUE(net.has_layer("innerprod")); - const vector > >& param_blobs = - net.layer_by_name("innerprod")->blobs(); - const int num_param_blobs = 2; - ASSERT_EQ(num_param_blobs, param_blobs.size()); - const Blob& weights = *param_blobs[0]; - const Blob& bias = *param_blobs[1]; - ASSERT_EQ(D * N, data.count()); - ASSERT_EQ(N, targets.count()); - ASSERT_EQ(D, weights.count()); - ASSERT_EQ(1, bias.count()); - - updated_params->clear(); - updated_params->resize(num_param_blobs); - for (int i = 0; i < num_param_blobs; ++i) { - (*updated_params)[i].reset(new Blob()); - } - Blob& updated_weights = *(*updated_params)[0]; - updated_weights.ReshapeLike(weights); - Blob& updated_bias = *(*updated_params)[1]; - updated_bias.ReshapeLike(bias); - - for (int i = 0; i <= D; ++i) { - // Compute the derivative with respect to the ith weight (i.e., the ith - // element of the gradient). - Dtype grad = 0; - for (int j = 0; j <= D; ++j) { - // Compute element (i, j) of X^T * X. - Dtype element = 0; - for (int k = 0; k < N; ++k) { - // (i, k) in X^T (== (k, i) in X) times (k, j) in X. - const Dtype element_i = (i == D) ? 1 : data.cpu_data()[k * D + i]; - const Dtype element_j = (j == D) ? 1 : data.cpu_data()[k * D + j]; - element += element_i * element_j; - } - if (j == D) { - grad += element * bias.cpu_data()[0]; - } else { - grad += element * weights.cpu_data()[j]; - } - } - for (int k = 0; k < N; ++k) { - const Dtype element_i = (i == D) ? 1 : data.cpu_data()[k * D + i]; - grad -= element_i * targets.cpu_data()[k]; - } - // Scale the gradient over the N samples. - grad /= N; - // Add the weight decay to the gradient. - grad += weight_decay * - ((i == D) ? bias.cpu_data()[0] : weights.cpu_data()[i]); - // Finally, compute update. - const vector > >& history = solver_->history(); - if (solver_->type() != string("AdaDelta") - && solver_->type() != string("Adam")) { - ASSERT_EQ(2, history.size()); // 1 blob for weights, 1 for bias - } else { - ASSERT_EQ(4, history.size()); // additional blobs for update history - } - Dtype update_value = learning_rate * grad; - const Dtype history_value = (i == D) ? - history[1]->cpu_data()[0] : history[0]->cpu_data()[i]; - const Dtype temp = momentum * history_value; - if (solver_->type() == string("SGD")) { - update_value += temp; - } else if (solver_->type() == string("Nesterov")) { - update_value += temp; - // step back then over-step - update_value = (1 + momentum) * update_value - temp; - } else if (solver_->type() == string("AdaGrad")) { - update_value /= std::sqrt(history_value + grad * grad) + delta_; - } else if (solver_->type() == string("RMSProp")) { - const Dtype rms_decay = 0.95; - update_value /= std::sqrt(rms_decay*history_value - + grad * grad * (1 - rms_decay)) + delta_; - } else if (solver_->type() == string("AdaDelta")) { - const Dtype update_history_value = (i == D) ? 
- history[1 + num_param_blobs]->cpu_data()[0] : - history[0 + num_param_blobs]->cpu_data()[i]; - const Dtype weighted_gradient_average = - momentum * history_value + (1 - momentum) * (grad * grad); - update_value = grad * std::sqrt((update_history_value + delta_) / - (weighted_gradient_average + delta_)) * learning_rate; - // not actually needed, just here for illustrative purposes - // const Dtype weighted_update_average = - // momentum * update_history_value + (1 - momentum) * (update_value); - } else if (solver_->type() == string("Adam")) { - const Dtype momentum2 = 0.999; - const Dtype m = history_value; - const Dtype v = (i == D) ? - history[1 + num_param_blobs]->cpu_data()[0] : - history[0 + num_param_blobs]->cpu_data()[i]; - const Dtype val_m = (1 - momentum) * grad + momentum * m; - const Dtype val_v = (1 - momentum2) * grad * grad + momentum2 * v; - Dtype alpha_t = learning_rate * - std::sqrt(Dtype(1) - pow(momentum2, num_iters)) / - (Dtype(1.) - pow(momentum, num_iters)); - update_value = alpha_t * val_m / (std::sqrt(val_v) + delta_); - } else { - LOG(FATAL) << "Unknown solver type: " << solver_->type(); - } - if (i == D) { - updated_bias.mutable_cpu_diff()[0] = update_value; - updated_bias.mutable_cpu_data()[0] = bias.cpu_data()[0] - update_value; - } else { - updated_weights.mutable_cpu_diff()[i] = update_value; - updated_weights.mutable_cpu_data()[i] = - weights.cpu_data()[i] - update_value; - } - } - } - - void CheckLeastSquaresUpdate( - const vector > >& updated_params) { - const int D = channels_ * height_ * width_; - - const Blob& updated_weights = *updated_params[0]; - const Blob& updated_bias = *updated_params[1]; - - Net& net = *this->solver_->net(); - ASSERT_TRUE(net.has_layer("innerprod")); - const vector > >& param_blobs = - net.layer_by_name("innerprod")->blobs(); - ASSERT_EQ(2, param_blobs.size()); - const Blob& solver_updated_weights = *param_blobs[0]; - ASSERT_EQ(D, solver_updated_weights.count()); - const double kPrecision = 1e-2; - const double kMinPrecision = 1e-7; - for (int i = 0; i < D; ++i) { - const Dtype expected_updated_weight = updated_weights.cpu_data()[i]; - const Dtype solver_updated_weight = solver_updated_weights.cpu_data()[i]; - const Dtype error_margin = std::max(kMinPrecision, kPrecision * - std::min(fabs(expected_updated_weight), fabs(solver_updated_weight))); - EXPECT_NEAR(expected_updated_weight, solver_updated_weight, error_margin); - } - const Blob& solver_updated_bias_blob = *param_blobs[1]; - ASSERT_EQ(1, solver_updated_bias_blob.count()); - const Dtype expected_updated_bias = updated_bias.cpu_data()[0]; - const Dtype solver_updated_bias = solver_updated_bias_blob.cpu_data()[0]; - const Dtype error_margin = std::max(kMinPrecision, kPrecision * - std::min(fabs(expected_updated_bias), fabs(solver_updated_bias))); - EXPECT_NEAR(expected_updated_bias, solver_updated_bias, error_margin); - - // Check the solver's history -- should contain the previous update value. 
- if (solver_->type() == string("SGD")) { - const vector > >& history = solver_->history(); - ASSERT_EQ(2, history.size()); - for (int i = 0; i < D; ++i) { - const Dtype expected_history = updated_weights.cpu_diff()[i]; - const Dtype solver_history = history[0]->cpu_data()[i]; - const Dtype error_margin_hist = std::max(kMinPrecision, kPrecision * - std::min(fabs(expected_history), fabs(solver_history))); - EXPECT_NEAR(expected_history, solver_history, error_margin_hist); - } - const Dtype expected_history = updated_bias.cpu_diff()[0]; - const Dtype solver_history = history[1]->cpu_data()[0]; - const Dtype error_margin_hist = std::max(kMinPrecision, kPrecision * - std::min(fabs(expected_history), fabs(solver_history))); - EXPECT_NEAR(expected_history, solver_history, error_margin_hist); - } - } - - void CheckAccumulation(const Dtype kLearningRate, const Dtype kWeightDecay, - const Dtype kMomentum, const int kNumIters, const int kIterSize) { - const double kPrecision = 1e-2; - const double kMinPrecision = 1e-7; - // Solve without accumulation and save parameters. - this->RunLeastSquaresSolver(kLearningRate, kWeightDecay, kMomentum, - kNumIters); - // Save parameters for comparison. - Net& net = *this->solver_->net(); - const vector > >& param_blobs = - net.layer_by_name("innerprod")->blobs(); - vector > > noaccum_params(param_blobs.size()); - for (int i = 0; i < param_blobs.size(); ++i) { - noaccum_params[i].reset(new Blob()); - noaccum_params[i]->CopyFrom(*param_blobs[i], false, true); - } - // Solve by equivalent accumulation of gradients over divided batches. - this->RunLeastSquaresSolver(kLearningRate, kWeightDecay, kMomentum, - kNumIters, kIterSize); - Net& net_accum = *this->solver_->net(); - const vector > >& accum_params = - net_accum.layer_by_name("innerprod")->blobs(); - // Compare accumulated parameters against no accumulation standard. - const int D = this->channels_ * this->height_ * this->width_; - for (int i = 0; i < D; ++i) { - const Dtype expected_param = noaccum_params[0]->cpu_data()[i]; - const Dtype accum_param = accum_params[0]->cpu_data()[i]; - const Dtype error_margin = std::max(kMinPrecision, kPrecision * - std::min(fabs(expected_param), fabs(accum_param))); - EXPECT_NEAR(expected_param, accum_param, error_margin); - } - ASSERT_EQ(1, accum_params[1]->count()); - const Dtype expected_bias = noaccum_params[1]->cpu_data()[0]; - const Dtype accum_bias = accum_params[1]->cpu_data()[0]; - const Dtype error_margin = std::max(kMinPrecision, kPrecision * - std::min(fabs(expected_bias), fabs(accum_bias))); - EXPECT_NEAR(expected_bias, accum_bias, error_margin); - } - - // Test that the correct update is computed for a regularized least squares - // problem: - // - // E = (1/(2n)) || X w - y ||^2 + (lambda / 2) || w ||^2 - // \nabla_w E = (1/n) (X^T X w - X^T y) + lambda * w - // - // X \in R^{n x (d+1)} (each example is a row, (d+1)th element is always 1) - // w \in R^{(d+1) x 1} ((d+1)th element is the bias) - // y \in R^{n x 1} - // lambda is weight_decay - // - // TestLeastSquaresUpdate works "inductively", assuming that the solver - // correctly updates the net K (= iter_to_check) times, then given the history - // from the Kth update, we compute the (K+1)th update and check that it - // matches the solver's (K+1)th update. - void TestLeastSquaresUpdate(const Dtype learning_rate = 1.0, - const Dtype weight_decay = 0.0, const Dtype momentum = 0.0, - const int iter_to_check = 0) { - const int kNum = num_; - const int kIterSize = 1; - // Test over all numbers of devices. 
- int available_devices = 1; -#ifndef CPU_ONLY - if (Caffe::mode() == Caffe::GPU) { - CUDA_CHECK(cudaGetDeviceCount(&available_devices)); - } -#endif - for (int devices = 1; devices <= available_devices; ++devices) { - // Configure batch size for single / multi device equivalence. - // Constant data is needed for multi device as for accumulation. - num_ = kNum * devices; - - // Initialize the solver and run K (= iter_to_check) solver iterations - // (on single device). - RunLeastSquaresSolver(learning_rate, weight_decay, momentum, - iter_to_check, kIterSize, 1); - - // Compute the (K+1)th update using the analytic least squares gradient. - vector > > updated_params; - ComputeLeastSquaresUpdate(learning_rate, weight_decay, momentum, - iter_to_check + 1, &updated_params); - - // Reinitialize the solver and run K+1 solver iterations. - num_ = kNum; - RunLeastSquaresSolver(learning_rate, weight_decay, momentum, - iter_to_check + 1, kIterSize, devices); - - // Check that the solver's solution matches ours. - CheckLeastSquaresUpdate(updated_params); - } - } - - void TestSnapshot(const Dtype learning_rate = 1.0, - const Dtype weight_decay = 0.0, const Dtype momentum = 0.0, - const int num_iters = 1) { - // Run the solver for num_iters * 2 iterations. - const int total_num_iters = num_iters * 2; - bool snapshot = false; - const int kIterSize = 1; - const int kDevices = 1; - RunLeastSquaresSolver(learning_rate, weight_decay, momentum, - total_num_iters, kIterSize, kDevices, snapshot); - - // Save the resulting param values. - vector > > param_copies; - const vector*>& orig_params = - solver_->net()->learnable_params(); - param_copies.resize(orig_params.size()); - for (int i = 0; i < orig_params.size(); ++i) { - param_copies[i].reset(new Blob()); - const bool kReshape = true; - for (int copy_diff = false; copy_diff <= true; ++copy_diff) { - param_copies[i]->CopyFrom(*orig_params[i], copy_diff, kReshape); - } - } - - // Save the solver history - vector > > history_copies; - const vector > >& orig_history = solver_->history(); - history_copies.resize(orig_history.size()); - for (int i = 0; i < orig_history.size(); ++i) { - history_copies[i].reset(new Blob()); - const bool kReshape = true; - for (int copy_diff = false; copy_diff <= true; ++copy_diff) { - history_copies[i]->CopyFrom(*orig_history[i], copy_diff, kReshape); - } - } - - // Run the solver for num_iters iterations and snapshot. - snapshot = true; - string snapshot_name = RunLeastSquaresSolver(learning_rate, weight_decay, - momentum, num_iters, kIterSize, kDevices, snapshot); - - // Reinitialize the solver and run for num_iters more iterations. - snapshot = false; - RunLeastSquaresSolver(learning_rate, weight_decay, momentum, - total_num_iters, kIterSize, kDevices, - snapshot, snapshot_name.c_str()); - - // Check that params now match. - const vector*>& params = solver_->net()->learnable_params(); - for (int i = 0; i < params.size(); ++i) { - for (int j = 0; j < params[i]->count(); ++j) { - EXPECT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j]) - << "param " << i << " data differed at dim " << j; - EXPECT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j]) - << "param " << i << " diff differed at dim " << j; - } - } - - // Check that history now matches. 
- const vector > >& history = solver_->history(); - for (int i = 0; i < history.size(); ++i) { - for (int j = 0; j < history[i]->count(); ++j) { - EXPECT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j]) - << "history blob " << i << " data differed at dim " << j; - EXPECT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j]) - << "history blob " << i << " diff differed at dim " << j; - } - } - } -}; - - -template -class SGDSolverTest : public GradientBasedSolverTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - virtual void InitSolver(const SolverParameter& param) { - this->solver_.reset(new SGDSolver(param)); - } -}; - -TYPED_TEST_CASE(SGDSolverTest, TestDtypesAndDevices); - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdate) { - this->TestLeastSquaresUpdate(); -} - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateLROneHundredth) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - this->TestLeastSquaresUpdate(kLearningRate); -} - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithWeightDecay) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 1; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithWeightDecayMultiIter) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0; - const Dtype kMomentum = 0.5; - const int kNumIters = 1; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0; - const Dtype kMomentum = 0.5; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverything) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.5; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.5; - const int kNumIters = 4; - this->share_ = true; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - const int kIterSize = 2; - this->CheckAccumulation(kLearningRate, kWeightDecay, 
kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - const int kIterSize = 2; - this->share_ = true; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(SGDSolverTest, TestSnapshot) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(SGDSolverTest, TestSnapshotShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - this->share_ = true; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - - -template -class AdaGradSolverTest : public GradientBasedSolverTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - virtual void InitSolver(const SolverParameter& param) { - this->solver_.reset(new AdaGradSolver(param)); - } -}; - -TYPED_TEST_CASE(AdaGradSolverTest, TestDtypesAndDevices); - -TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdate) { - this->TestLeastSquaresUpdate(); -} - -TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateLROneHundredth) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - this->TestLeastSquaresUpdate(kLearningRate); -} - -TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithWeightDecay) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay); -} - -TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithEverything) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdaGradSolverTest, - TestAdaGradLeastSquaresUpdateWithEverythingShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - this->share_ = true; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - const int kIterSize = 2; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - const int kIterSize = 2; - this->share_ = true; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - 
-TYPED_TEST(AdaGradSolverTest, TestSnapshot) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdaGradSolverTest, TestSnapshotShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - this->share_ = true; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - - -template -class NesterovSolverTest : public GradientBasedSolverTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - virtual void InitSolver(const SolverParameter& param) { - this->solver_.reset(new NesterovSolver(param)); - } -}; - -TYPED_TEST_CASE(NesterovSolverTest, TestDtypesAndDevices); - -TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdate) { - this->TestLeastSquaresUpdate(); -} - -TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateLROneHundredth) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - this->TestLeastSquaresUpdate(kLearningRate); -} - -TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithWeightDecay) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay); -} - -TYPED_TEST(NesterovSolverTest, - TestNesterovLeastSquaresUpdateWithWeightDecayMultiIter) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithMomentum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0; - const Dtype kMomentum = 0.5; - const int kNumIters = 1; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0; - const Dtype kMomentum = 0.5; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(NesterovSolverTest, - TestNesterovLeastSquaresUpdateWithEverythingShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - this->share_ = true; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(NesterovSolverTest, 
TestLeastSquaresUpdateWithEverythingAccum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - const int kIterSize = 2; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - const int kIterSize = 2; - this->share_ = true; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(NesterovSolverTest, TestSnapshot) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(NesterovSolverTest, TestSnapshotShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - this->share_ = true; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -template -class AdaDeltaSolverTest : public GradientBasedSolverTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - virtual void InitSolver(const SolverParameter& param) { - this->solver_.reset(new AdaDeltaSolver(param)); - } -}; - -TYPED_TEST_CASE(AdaDeltaSolverTest, TestDtypesAndDevices); - -TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdate) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - this->TestLeastSquaresUpdate(kLearningRate); -} - -TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithWeightDecay) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.95; - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum); -} - -TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithHalfMomentum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.0; - const Dtype kMomentum = 0.5; - const int kNumIters = 1; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum); - } -} - -TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithMomentum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.0; - const Dtype kMomentum = 0.95; - const int kNumIters = 1; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum); - } -} - -TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.0; - const Dtype kMomentum = 0.95; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithEverything) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.1; - 
const Dtype kMomentum = 0.95; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdaDeltaSolverTest, - TestAdaDeltaLeastSquaresUpdateWithEverythingShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.95; - const int kNumIters = 4; - this->share_ = true; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.95; - const int kNumIters = 4; - const int kIterSize = 2; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.95; - const int kNumIters = 4; - const int kIterSize = 2; - this->share_ = true; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(AdaDeltaSolverTest, TestSnapshot) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.95; - const int kNumIters = 4; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdaDeltaSolverTest, TestSnapshotShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.1; - const Dtype kWeightDecay = 0.1; - const Dtype kMomentum = 0.95; - const int kNumIters = 4; - this->share_ = true; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -template -class AdamSolverTest : public GradientBasedSolverTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - virtual void InitSolver(const SolverParameter& param) { - SolverParameter new_param = param; - const Dtype momentum = 0.9; - new_param.set_momentum(momentum); - const Dtype momentum2 = 0.999; - new_param.set_momentum2(momentum2); - this->solver_.reset(new AdamSolver(new_param)); - } -}; - -TYPED_TEST_CASE(AdamSolverTest, TestDtypesAndDevices); - -TYPED_TEST(AdamSolverTest, TestAdamLeastSquaresUpdate) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0; - const Dtype kMomentum = 0.9; - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum); -} - -TYPED_TEST(AdamSolverTest, TestAdamLeastSquaresUpdateWithWeightDecay) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum); -} - -TYPED_TEST(AdamSolverTest, TestAdamLeastSquaresUpdateWithEverything) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdamSolverTest, TestAdamLeastSquaresUpdateWithEverythingShare) { - typedef 
typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - this->share_ = true; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdamSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - const int kIterSize = 2; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(AdamSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - const int kIterSize = 2; - this->share_ = true; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(AdamSolverTest, TestSnapshot) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(AdamSolverTest, TestSnapshotShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.9; - const int kNumIters = 4; - this->share_ = true; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -template -class RMSPropSolverTest : public GradientBasedSolverTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - virtual void InitSolver(const SolverParameter& param) { - const Dtype rms_decay = 0.95; - SolverParameter new_param = param; - new_param.set_rms_decay(rms_decay); - this->solver_.reset(new RMSPropSolver(new_param)); - } -}; - -TYPED_TEST_CASE(RMSPropSolverTest, TestDtypesAndDevices); - -TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithWeightDecay) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 1.0; - const Dtype kWeightDecay = 0.5; - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay); -} - -TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithRmsDecay) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.0; - const Dtype kMomentum = 0.0; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithEverything) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.0; - const int kNumIters = 4; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(RMSPropSolverTest, - TestRMSPropLeastSquaresUpdateWithEverythingShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.0; - const int kNumIters = 4; - this->share_ = true; - for (int i = 0; i <= kNumIters; ++i) { - this->TestLeastSquaresUpdate(kLearningRate, 
kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(RMSPropSolverTest, TestLeastSquaresUpdateWithEverythingAccum) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.0; - const int kNumIters = 4; - const int kIterSize = 2; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(RMSPropSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0.0; - const int kNumIters = 4; - const int kIterSize = 2; - this->share_ = true; - this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters, - kIterSize); -} - -TYPED_TEST(RMSPropSolverTest, TestSnapshot) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -TYPED_TEST(RMSPropSolverTest, TestSnapshotShare) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kLearningRate = 0.01; - const Dtype kWeightDecay = 0.5; - const Dtype kMomentum = 0; - const int kNumIters = 4; - this->share_ = true; - for (int i = 1; i <= kNumIters; ++i) { - this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i); - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp deleted file mode 100644 index 3833ebf..0000000 --- a/src/caffe/test/test_hdf5_output_layer.cpp +++ /dev/null @@ -1,121 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layers/hdf5_output_layer.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/hdf5.hpp" -#include "caffe/util/io.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class HDF5OutputLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - HDF5OutputLayerTest() - : input_file_name_( - CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data.h5"), - blob_data_(new Blob()), - blob_label_(new Blob()), - num_(5), - channels_(8), - height_(5), - width_(5) { - MakeTempFilename(&output_file_name_); - } - - virtual ~HDF5OutputLayerTest() { - delete blob_data_; - delete blob_label_; - } - - void CheckBlobEqual(const Blob& b1, const Blob& b2); - - string output_file_name_; - string input_file_name_; - Blob* const blob_data_; - Blob* const blob_label_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - int num_; - int channels_; - int height_; - int width_; -}; - -template -void HDF5OutputLayerTest::CheckBlobEqual(const Blob& b1, - const Blob& b2) { - EXPECT_EQ(b1.num(), b2.num()); - EXPECT_EQ(b1.channels(), b2.channels()); - EXPECT_EQ(b1.height(), b2.height()); - EXPECT_EQ(b1.width(), b2.width()); - for (int n = 0; n < b1.num(); ++n) { - for (int c = 0; c < b1.channels(); ++c) { - for (int h = 0; h < b1.height(); ++h) { - for (int w = 0; w < b1.width(); ++w) { - EXPECT_EQ(b1.data_at(n, c, h, w), b2.data_at(n, c, h, w)); - } - } - } - } -} - -TYPED_TEST_CASE(HDF5OutputLayerTest, TestDtypesAndDevices); - -TYPED_TEST(HDF5OutputLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LOG(INFO) << "Loading HDF5 file " << this->input_file_name_; - hid_t file_id = 
H5Fopen(this->input_file_name_.c_str(), H5F_ACC_RDONLY, - H5P_DEFAULT); - ASSERT_GE(file_id, 0)<< "Failed to open HDF5 file" << - this->input_file_name_; - hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4, - this->blob_data_); - hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4, - this->blob_label_); - herr_t status = H5Fclose(file_id); - EXPECT_GE(status, 0)<< "Failed to close HDF5 file " << - this->input_file_name_; - this->blob_bottom_vec_.push_back(this->blob_data_); - this->blob_bottom_vec_.push_back(this->blob_label_); - - LayerParameter param; - param.mutable_hdf5_output_param()->set_file_name(this->output_file_name_); - // This code block ensures that the layer is deconstructed and - // the output hdf5 file is closed. - { - HDF5OutputLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(layer.file_name(), this->output_file_name_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - } - file_id = H5Fopen(this->output_file_name_.c_str(), H5F_ACC_RDONLY, - H5P_DEFAULT); - ASSERT_GE( - file_id, 0)<< "Failed to open HDF5 file" << - this->input_file_name_; - - Blob* blob_data = new Blob(); - hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4, - blob_data); - this->CheckBlobEqual(*(this->blob_data_), *blob_data); - - Blob* blob_label = new Blob(); - hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4, - blob_label); - this->CheckBlobEqual(*(this->blob_label_), *blob_label); - - status = H5Fclose(file_id); - EXPECT_GE(status, 0) << "Failed to close HDF5 file " << - this->output_file_name_; -} - -} // namespace caffe diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp deleted file mode 100644 index 8884ce9..0000000 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include - -#include "hdf5.h" - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layers/hdf5_data_layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class HDF5DataLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - HDF5DataLayerTest() - : filename(NULL), - blob_top_data_(new Blob()), - blob_top_label_(new Blob()), - blob_top_label2_(new Blob()) {} - virtual void SetUp() { - blob_top_vec_.push_back(blob_top_data_); - blob_top_vec_.push_back(blob_top_label_); - blob_top_vec_.push_back(blob_top_label2_); - - // Check out generate_sample_data.py in the same directory. - filename = new string( - CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data_list.txt" CMAKE_EXT); - LOG(INFO)<< "Using sample HDF5 data file " << filename; - } - - virtual ~HDF5DataLayerTest() { - delete blob_top_data_; - delete blob_top_label_; - delete blob_top_label2_; - delete filename; - } - - string* filename; - Blob* const blob_top_data_; - Blob* const blob_top_label_; - Blob* const blob_top_label2_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(HDF5DataLayerTest, TestDtypesAndDevices); - -TYPED_TEST(HDF5DataLayerTest, TestRead) { - typedef typename TypeParam::Dtype Dtype; - // Create LayerParameter with the known parameters. - // The data file we are reading has 10 rows and 8 columns, - // with values from 0 to 10*8 reshaped in row-major order. 
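// Illustrative sketch (not part of the original patch): the row-major offset
// arithmetic that the data checks in this test rely on; the helper name is
// hypothetical. For a blob of shape (N, C, H, W), element (n, c, h, w) lives at
// ((n*C + c)*H + h)*W + w.
inline int RowMajorOffset(int n, int c, int h, int w, int C, int H, int W) {
  return ((n * C + c) * H + h) * W + w;
}
// With the sample file's shape (channels 8, height 6, width 5), one sample is
// 8*6*5 = 240 values, a batch of 5 advances the offset by 1200, and a whole
// 10-row file holds 2400 values -- the file_offset constant used further down.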
- LayerParameter param; - param.add_top("data"); - param.add_top("label"); - param.add_top("label2"); - - HDF5DataParameter* hdf5_data_param = param.mutable_hdf5_data_param(); - int batch_size = 5; - hdf5_data_param->set_batch_size(batch_size); - hdf5_data_param->set_source(*(this->filename)); - int num_cols = 8; - int height = 6; - int width = 5; - - // Test that the layer setup got the correct parameters. - HDF5DataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_data_->num(), batch_size); - EXPECT_EQ(this->blob_top_data_->channels(), num_cols); - EXPECT_EQ(this->blob_top_data_->height(), height); - EXPECT_EQ(this->blob_top_data_->width(), width); - - EXPECT_EQ(this->blob_top_label_->num_axes(), 2); - EXPECT_EQ(this->blob_top_label_->shape(0), batch_size); - EXPECT_EQ(this->blob_top_label_->shape(1), 1); - - EXPECT_EQ(this->blob_top_label2_->num_axes(), 2); - EXPECT_EQ(this->blob_top_label2_->shape(0), batch_size); - EXPECT_EQ(this->blob_top_label2_->shape(1), 1); - - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - - // Go through the data 10 times (5 batches). - const int data_size = num_cols * height * width; - for (int iter = 0; iter < 10; ++iter) { - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - // On even iterations, we're reading the first half of the data. - // On odd iterations, we're reading the second half of the data. - // NB: label is 1-indexed - int label_offset = 1 + ((iter % 2 == 0) ? 0 : batch_size); - int label2_offset = 1 + label_offset; - int data_offset = (iter % 2 == 0) ? 0 : batch_size * data_size; - - // Every two iterations we are reading the second file, - // which has the same labels, but data is offset by total data size, - // which is 2400 (see generate_sample_data). - int file_offset = (iter % 4 < 2) ? 
0 : 2400; - - for (int i = 0; i < batch_size; ++i) { - EXPECT_EQ( - label_offset + i, - this->blob_top_label_->cpu_data()[i]); - EXPECT_EQ( - label2_offset + i, - this->blob_top_label2_->cpu_data()[i]); - } - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < num_cols; ++j) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - int idx = ( - i * num_cols * height * width + - j * height * width + - h * width + w); - EXPECT_EQ( - file_offset + data_offset + idx, - this->blob_top_data_->cpu_data()[idx]) - << "debug: i " << i << " j " << j - << " iter " << iter; - } - } - } - } - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_hinge_loss_layer.cpp b/src/caffe/test/test_hinge_loss_layer.cpp deleted file mode 100644 index 8bf89fa..0000000 --- a/src/caffe/test/test_hinge_loss_layer.cpp +++ /dev/null @@ -1,74 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/hinge_loss_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class HingeLossLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - HingeLossLayerTest() - : blob_bottom_data_(new Blob(10, 5, 1, 1)), - blob_bottom_label_(new Blob(10, 1, 1, 1)), - blob_top_loss_(new Blob()) { - // fill the values - Caffe::set_random_seed(1701); - FillerParameter filler_param; - filler_param.set_std(10); - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_data_); - blob_bottom_vec_.push_back(blob_bottom_data_); - for (int i = 0; i < blob_bottom_label_->count(); ++i) { - blob_bottom_label_->mutable_cpu_data()[i] = caffe_rng_rand() % 5; - } - blob_bottom_vec_.push_back(blob_bottom_label_); - blob_top_vec_.push_back(blob_top_loss_); - } - virtual ~HingeLossLayerTest() { - delete blob_bottom_data_; - delete blob_bottom_label_; - delete blob_top_loss_; - } - Blob* const blob_bottom_data_; - Blob* const blob_bottom_label_; - Blob* const blob_top_loss_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(HingeLossLayerTest, TestDtypesAndDevices); - - -TYPED_TEST(HingeLossLayerTest, TestGradientL1) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - HingeLossLayer layer(layer_param); - GradientChecker checker(1e-2, 2e-3, 1701, 1, 0.01); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -TYPED_TEST(HingeLossLayerTest, TestGradientL2) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - // Set norm to L2 - HingeLossParameter* hinge_loss_param = layer_param.mutable_hinge_loss_param(); - hinge_loss_param->set_norm(HingeLossParameter_Norm_L2); - HingeLossLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2, 1701); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -} // namespace caffe diff --git a/src/caffe/test/test_im2col_kernel.cu b/src/caffe/test/test_im2col_kernel.cu deleted file mode 100644 index e3a9791..0000000 --- a/src/caffe/test/test_im2col_kernel.cu +++ /dev/null @@ -1,213 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/im2col_layer.hpp" -#include "caffe/util/im2col.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -// Forward declare kernel 
functions -template -__global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int height_col, const int width_col, - Dtype* data_col); - -template -__global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - const int* dilation, Dtype* data_col); - -template -class Im2colKernelTest : public GPUDeviceTest { - protected: - Im2colKernelTest() - // big so launches > 1024 threads - : blob_bottom_(new Blob(5, 500, 15, 15)), - blob_kernel_shape_(new Blob()), - blob_stride_(new Blob()), - blob_pad_(new Blob()), - blob_dilation_(new Blob()), - blob_top_(new Blob()), - blob_top_cpu_(new Blob()) { - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - vector dim_blob_shape(1, 2); - blob_kernel_shape_->Reshape(dim_blob_shape); - blob_stride_->Reshape(dim_blob_shape); - blob_pad_->Reshape(dim_blob_shape); - blob_dilation_->Reshape(dim_blob_shape); - - height_ = blob_bottom_->height(); - width_ = blob_bottom_->width(); - channels_ = blob_bottom_->channels(); - pad_ = 0; - stride_ = 2; - dilation_ = 3; - kernel_size_ = 3; - height_col_ = (height_ + 2 * pad_ - - (dilation_ * (kernel_size_ - 1) + 1)) / stride_ + 1; - width_col_ = (width_ + 2 * pad_ - - (dilation_ * (kernel_size_ - 1) + 1)) / stride_ + 1; - - for (int i = 0; i < 2; ++i) { - blob_kernel_shape_->mutable_cpu_data()[i] = kernel_size_; - blob_stride_->mutable_cpu_data()[i] = stride_; - blob_pad_->mutable_cpu_data()[i] = pad_; - blob_dilation_->mutable_cpu_data()[i] = dilation_; - } - } - - virtual ~Im2colKernelTest() { - delete blob_bottom_; - delete blob_top_; - delete blob_top_cpu_; - delete blob_kernel_shape_; - delete blob_stride_; - delete blob_pad_; - delete blob_dilation_; - } - - Blob* const blob_kernel_shape_; - Blob* const blob_stride_; - Blob* const blob_pad_; - Blob* const blob_dilation_; - Blob* const blob_bottom_; - Blob* const blob_top_; - Blob* const blob_top_cpu_; - int height_; - int width_; - int channels_; - int pad_; - int stride_; - int dilation_; - int kernel_size_; - int height_col_; - int width_col_; -}; - -TYPED_TEST_CASE(Im2colKernelTest, TestDtypes); - -TYPED_TEST(Im2colKernelTest, Test2D) { - // Reshape the blobs to correct size for im2col output - this->blob_top_->Reshape(this->blob_bottom_->num(), - this->channels_ * this->kernel_size_ * this->kernel_size_, - this->height_col_, - this->width_col_); - - this->blob_top_cpu_->Reshape(this->blob_bottom_->num(), - this->channels_ * this->kernel_size_ * this->kernel_size_, - this->height_col_, - this->width_col_); - - const TypeParam* bottom_data = this->blob_bottom_->gpu_data(); - TypeParam* top_data = this->blob_top_->mutable_gpu_data(); - TypeParam* cpu_data = this->blob_top_cpu_->mutable_cpu_data(); - - // CPU Version - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - im2col_cpu(this->blob_bottom_->cpu_data() + this->blob_bottom_->offset(n), - this->channels_, this->height_, this->width_, - this->kernel_size_, this->kernel_size_, this->pad_, this->pad_, - this->stride_, this->stride_, this->dilation_, this->dilation_, - cpu_data + this->blob_top_cpu_->offset(n)); - } - - // GPU version - int num_kernels = this->channels_ * this->height_col_ * this->width_col_; - int 
default_grid_dim = CAFFE_GET_BLOCKS(num_kernels); - - // Launch with different grid sizes - for (int grid_div = 2; grid_div <= 8; grid_div++) { - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - int grid_dim = default_grid_dim/grid_div; - // NOLINT_NEXT_LINE(whitespace/operators) - im2col_gpu_kernel<<>>( - num_kernels, bottom_data + this->blob_bottom_->offset(n), - this->height_, this->width_, this->kernel_size_, this->kernel_size_, - this->pad_, this->pad_, this->stride_, this->stride_, - this->dilation_, this->dilation_, - this->height_col_, this->width_col_, - top_data + this->blob_top_->offset(n)); - CUDA_POST_KERNEL_CHECK; - } - - // Compare results against CPU version - for (int i = 0; i < this->blob_top_->count(); ++i) { - TypeParam cpuval = cpu_data[i]; - TypeParam gpuval = this->blob_top_->cpu_data()[i]; - EXPECT_EQ(cpuval, gpuval); - if (cpuval != gpuval) { - break; - } - } - } -} - -TYPED_TEST(Im2colKernelTest, TestND) { - // Reshape the blobs to correct size for im2col output - this->blob_top_->Reshape(this->blob_bottom_->num(), - this->channels_ * this->kernel_size_ * this->kernel_size_, - this->height_col_, - this->width_col_); - - this->blob_top_cpu_->ReshapeLike(*this->blob_top_); - - const TypeParam* bottom_data_cpu = this->blob_bottom_->cpu_data(); - TypeParam* top_data_cpu = this->blob_top_cpu_->mutable_cpu_data(); - - // CPU Version - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - im2col_nd_cpu(bottom_data_cpu + this->blob_bottom_->offset(n), 2, - this->blob_bottom_->shape().data() + 1, - this->blob_top_cpu_->shape().data() + 1, - this->blob_kernel_shape_->cpu_data(), - this->blob_pad_->cpu_data(), this->blob_stride_->cpu_data(), - this->blob_dilation_->cpu_data(), - top_data_cpu + this->blob_top_cpu_->offset(n)); - } - - // GPU version - int num_kernels = this->channels_ * this->height_col_ * this->width_col_; - int default_grid_dim = CAFFE_GET_BLOCKS(num_kernels); - const TypeParam* bottom_data_gpu = this->blob_bottom_->gpu_data(); - - // Launch with different grid sizes - for (int grid_div = 2; grid_div <= 8; grid_div++) { - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - const int grid_dim = default_grid_dim / grid_div; - TypeParam* top_data_gpu = this->blob_top_->mutable_gpu_data(); - // NOLINT_NEXT_LINE(whitespace/operators) - im2col_nd_gpu_kernel<<>>( - num_kernels, bottom_data_gpu + this->blob_bottom_->offset(n), - this->blob_bottom_->gpu_shape() + 1, this->blob_top_->gpu_shape() + 1, - this->blob_kernel_shape_->gpu_data(), this->blob_pad_->gpu_data(), - this->blob_stride_->gpu_data(), this->blob_dilation_->gpu_data(), - top_data_gpu + this->blob_top_->offset(n)); - CUDA_POST_KERNEL_CHECK; - } - - // Compare results against CPU version - for (int i = 0; i < this->blob_top_->count(); ++i) { - TypeParam cpuval = top_data_cpu[i]; - TypeParam gpuval = this->blob_top_->cpu_data()[i]; - EXPECT_EQ(cpuval, gpuval); - if (cpuval != gpuval) { - break; - } - } - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_im2col_layer.cpp b/src/caffe/test/test_im2col_layer.cpp deleted file mode 100644 index a7faf18..0000000 --- a/src/caffe/test/test_im2col_layer.cpp +++ /dev/null @@ -1,178 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/im2col_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class Im2colLayerTest : public MultiDeviceTest { - typedef 
typename TypeParam::Dtype Dtype; - protected: - Im2colLayerTest() - : blob_bottom_(new Blob(2, 3, 6, 5)), - blob_top_(new Blob()) { - // fill the values - Caffe::set_random_seed(1701); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~Im2colLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(Im2colLayerTest, TestDtypesAndDevices); - -TYPED_TEST(Im2colLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - vector bottom_shape; - bottom_shape.push_back(2); - bottom_shape.push_back(3); - bottom_shape.push_back(10); - bottom_shape.push_back(11); - this->blob_bottom_->Reshape(bottom_shape); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->add_dilation(3); - Im2colLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 27); - EXPECT_EQ(this->blob_top_->height(), 2); - EXPECT_EQ(this->blob_top_->width(), 3); -} - -TYPED_TEST(Im2colLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - Im2colLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // We are lazy and will only check the top left block - for (int c = 0; c < 27; ++c) { - EXPECT_EQ(this->blob_bottom_->data_at(0, (c / 9), (c / 3) % 3, c % 3), - this->blob_top_->data_at(0, c, 0, 0)); - } -} - -TYPED_TEST(Im2colLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - Im2colLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(Im2colLayerTest, TestDilatedGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - vector bottom_shape; - bottom_shape.push_back(2); - bottom_shape.push_back(3); - bottom_shape.push_back(10); - bottom_shape.push_back(9); - this->blob_bottom_->Reshape(bottom_shape); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->add_dilation(3); - Im2colLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(Im2colLayerTest, TestGradientForceND) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->set_force_nd_im2col(true); - Im2colLayer layer(layer_param); - 
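// Illustrative sketch (not part of the original patch): the output-size formula
// behind the TestSetup expectations above (top channels 27, height 2, width 3
// for a 10x11 bottom with kernel 3, stride 2, dilation 3). The helper name is
// hypothetical.
inline int ConvOutputDim(int input, int kernel, int pad, int stride,
                         int dilation) {
  const int kernel_extent = dilation * (kernel - 1) + 1;
  return (input + 2 * pad - kernel_extent) / stride + 1;
}
// ConvOutputDim(10, 3, 0, 2, 3) == 2 and ConvOutputDim(11, 3, 0, 2, 3) == 3,
// while the im2col channel count is channels * kernel_h * kernel_w = 3*3*3 = 27.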
GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(Im2colLayerTest, TestDilatedGradientForceND) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - vector bottom_shape; - bottom_shape.push_back(2); - bottom_shape.push_back(3); - bottom_shape.push_back(10); - bottom_shape.push_back(9); - this->blob_bottom_->Reshape(bottom_shape); - convolution_param->add_kernel_size(3); - convolution_param->add_stride(2); - convolution_param->add_dilation(3); - convolution_param->set_force_nd_im2col(true); - Im2colLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(Im2colLayerTest, TestRect) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->set_kernel_h(5); - convolution_param->set_kernel_w(3); - convolution_param->add_stride(2); - Im2colLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // We are lazy and will only check the top left block - for (int c = 0; c < 45; ++c) { - EXPECT_EQ(this->blob_top_->data_at(0, c, 0, 0), - this->blob_bottom_->data_at(0, (c / 15), (c / 3) % 5, c % 3)); - } -} - -TYPED_TEST(Im2colLayerTest, TestRectGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ConvolutionParameter* convolution_param = - layer_param.mutable_convolution_param(); - convolution_param->set_kernel_h(5); - convolution_param->set_kernel_w(3); - convolution_param->add_stride(2); - Im2colLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_image_data_layer.cpp b/src/caffe/test/test_image_data_layer.cpp deleted file mode 100644 index ce5e0bc..0000000 --- a/src/caffe/test/test_image_data_layer.cpp +++ /dev/null @@ -1,219 +0,0 @@ -#ifdef USE_OPENCV -#include -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/image_data_layer.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/io.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class ImageDataLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - ImageDataLayerTest() - : seed_(1701), - blob_top_data_(new Blob()), - blob_top_label_(new Blob()) {} - virtual void SetUp() { - blob_top_vec_.push_back(blob_top_data_); - blob_top_vec_.push_back(blob_top_label_); - Caffe::set_random_seed(seed_); - // Create test input file. - MakeTempFilename(&filename_); - std::ofstream outfile(filename_.c_str(), std::ofstream::out); - LOG(INFO) << "Using temporary file " << filename_; - for (int i = 0; i < 5; ++i) { - outfile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << i << std::endl; - } - outfile.close(); - // Create test input file for images of distinct sizes. 
- MakeTempFilename(&filename_reshape_); - std::ofstream reshapefile(filename_reshape_.c_str(), std::ofstream::out); - LOG(INFO) << "Using temporary file " << filename_reshape_; - reshapefile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << 0 << std::endl; - reshapefile << EXAMPLES_SOURCE_DIR "images/fish-bike.jpg " << 1 - << std::endl; - reshapefile.close(); - // Create test input file for images with space in names - MakeTempFilename(&filename_space_); - std::ofstream spacefile(filename_space_.c_str(), std::ofstream::out); - LOG(INFO) << "Using temporary file " << filename_space_; - spacefile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << 0 << std::endl; - spacefile << EXAMPLES_SOURCE_DIR "images/cat gray.jpg " << 1 << std::endl; - spacefile.close(); - } - - virtual ~ImageDataLayerTest() { - delete blob_top_data_; - delete blob_top_label_; - } - - int seed_; - string filename_; - string filename_reshape_; - string filename_space_; - Blob* const blob_top_data_; - Blob* const blob_top_label_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(ImageDataLayerTest, TestDtypesAndDevices); - -TYPED_TEST(ImageDataLayerTest, TestRead) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter param; - ImageDataParameter* image_data_param = param.mutable_image_data_param(); - image_data_param->set_batch_size(5); - image_data_param->set_source(this->filename_.c_str()); - image_data_param->set_shuffle(false); - ImageDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_data_->num(), 5); - EXPECT_EQ(this->blob_top_data_->channels(), 3); - EXPECT_EQ(this->blob_top_data_->height(), 360); - EXPECT_EQ(this->blob_top_data_->width(), 480); - EXPECT_EQ(this->blob_top_label_->num(), 5); - EXPECT_EQ(this->blob_top_label_->channels(), 1); - EXPECT_EQ(this->blob_top_label_->height(), 1); - EXPECT_EQ(this->blob_top_label_->width(), 1); - // Go through the data twice - for (int iter = 0; iter < 2; ++iter) { - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < 5; ++i) { - EXPECT_EQ(i, this->blob_top_label_->cpu_data()[i]); - } - } -} - -TYPED_TEST(ImageDataLayerTest, TestResize) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter param; - ImageDataParameter* image_data_param = param.mutable_image_data_param(); - image_data_param->set_batch_size(5); - image_data_param->set_source(this->filename_.c_str()); - image_data_param->set_new_height(256); - image_data_param->set_new_width(256); - image_data_param->set_shuffle(false); - ImageDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_data_->num(), 5); - EXPECT_EQ(this->blob_top_data_->channels(), 3); - EXPECT_EQ(this->blob_top_data_->height(), 256); - EXPECT_EQ(this->blob_top_data_->width(), 256); - EXPECT_EQ(this->blob_top_label_->num(), 5); - EXPECT_EQ(this->blob_top_label_->channels(), 1); - EXPECT_EQ(this->blob_top_label_->height(), 1); - EXPECT_EQ(this->blob_top_label_->width(), 1); - // Go through the data twice - for (int iter = 0; iter < 2; ++iter) { - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < 5; ++i) { - EXPECT_EQ(i, this->blob_top_label_->cpu_data()[i]); - } - } -} - -TYPED_TEST(ImageDataLayerTest, TestReshape) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter param; - ImageDataParameter* image_data_param = param.mutable_image_data_param(); - image_data_param->set_batch_size(1); - 
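// Illustrative sketch (not part of the original patch): one way to parse the
// "image_path label" lines that SetUp() writes above. Splitting on the *last*
// space is what lets a path such as "cat gray.jpg" (exercised by TestSpace
// below) keep its label. Function and variable names here are hypothetical.
#include <cstdlib>
#include <fstream>
#include <string>
#include <utility>
#include <vector>

std::vector<std::pair<std::string, int> > ReadImageList(const std::string& source) {
  std::vector<std::pair<std::string, int> > entries;
  std::ifstream infile(source.c_str());
  std::string line;
  while (std::getline(infile, line)) {
    const size_t pos = line.find_last_of(' ');
    if (pos == std::string::npos) continue;  // skip malformed lines
    const int label = std::atoi(line.substr(pos + 1).c_str());
    entries.push_back(std::make_pair(line.substr(0, pos), label));
  }
  return entries;
}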
image_data_param->set_source(this->filename_reshape_.c_str()); - image_data_param->set_shuffle(false); - ImageDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_label_->num(), 1); - EXPECT_EQ(this->blob_top_label_->channels(), 1); - EXPECT_EQ(this->blob_top_label_->height(), 1); - EXPECT_EQ(this->blob_top_label_->width(), 1); - // cat.jpg - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_data_->num(), 1); - EXPECT_EQ(this->blob_top_data_->channels(), 3); - EXPECT_EQ(this->blob_top_data_->height(), 360); - EXPECT_EQ(this->blob_top_data_->width(), 480); - // fish-bike.jpg - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_data_->num(), 1); - EXPECT_EQ(this->blob_top_data_->channels(), 3); - EXPECT_EQ(this->blob_top_data_->height(), 323); - EXPECT_EQ(this->blob_top_data_->width(), 481); -} - -TYPED_TEST(ImageDataLayerTest, TestShuffle) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter param; - ImageDataParameter* image_data_param = param.mutable_image_data_param(); - image_data_param->set_batch_size(5); - image_data_param->set_source(this->filename_.c_str()); - image_data_param->set_shuffle(true); - ImageDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_data_->num(), 5); - EXPECT_EQ(this->blob_top_data_->channels(), 3); - EXPECT_EQ(this->blob_top_data_->height(), 360); - EXPECT_EQ(this->blob_top_data_->width(), 480); - EXPECT_EQ(this->blob_top_label_->num(), 5); - EXPECT_EQ(this->blob_top_label_->channels(), 1); - EXPECT_EQ(this->blob_top_label_->height(), 1); - EXPECT_EQ(this->blob_top_label_->width(), 1); - // Go through the data twice - for (int iter = 0; iter < 2; ++iter) { - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - map values_to_indices; - int num_in_order = 0; - for (int i = 0; i < 5; ++i) { - Dtype value = this->blob_top_label_->cpu_data()[i]; - // Check that the value has not been seen already (no duplicates). 
- EXPECT_EQ(values_to_indices.find(value), values_to_indices.end()); - values_to_indices[value] = i; - num_in_order += (value == Dtype(i)); - } - EXPECT_EQ(5, values_to_indices.size()); - EXPECT_GT(5, num_in_order); - } -} - -TYPED_TEST(ImageDataLayerTest, TestSpace) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter param; - ImageDataParameter* image_data_param = param.mutable_image_data_param(); - image_data_param->set_batch_size(1); - image_data_param->set_source(this->filename_space_.c_str()); - image_data_param->set_shuffle(false); - ImageDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_label_->num(), 1); - EXPECT_EQ(this->blob_top_label_->channels(), 1); - EXPECT_EQ(this->blob_top_label_->height(), 1); - EXPECT_EQ(this->blob_top_label_->width(), 1); - // cat.jpg - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_data_->num(), 1); - EXPECT_EQ(this->blob_top_data_->channels(), 3); - EXPECT_EQ(this->blob_top_data_->height(), 360); - EXPECT_EQ(this->blob_top_data_->width(), 480); - EXPECT_EQ(this->blob_top_label_->cpu_data()[0], 0); - // cat gray.jpg - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_data_->num(), 1); - EXPECT_EQ(this->blob_top_data_->channels(), 3); - EXPECT_EQ(this->blob_top_data_->height(), 360); - EXPECT_EQ(this->blob_top_data_->width(), 480); - EXPECT_EQ(this->blob_top_label_->cpu_data()[0], 1); -} - -} // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/test/test_infogain_loss_layer.cpp b/src/caffe/test/test_infogain_loss_layer.cpp deleted file mode 100644 index a24ac68..0000000 --- a/src/caffe/test/test_infogain_loss_layer.cpp +++ /dev/null @@ -1,67 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/infogain_loss_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class InfogainLossLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - InfogainLossLayerTest() - : blob_bottom_data_(new Blob(10, 5, 1, 1)), - blob_bottom_label_(new Blob(10, 1, 1, 1)), - blob_bottom_infogain_(new Blob(1, 1, 5, 5)), - blob_top_loss_(new Blob()) { - Caffe::set_random_seed(1701); - FillerParameter filler_param; - PositiveUnitballFiller filler(filler_param); - filler.Fill(this->blob_bottom_data_); - blob_bottom_vec_.push_back(blob_bottom_data_); - for (int i = 0; i < blob_bottom_label_->count(); ++i) { - blob_bottom_label_->mutable_cpu_data()[i] = caffe_rng_rand() % 5; - } - blob_bottom_vec_.push_back(blob_bottom_label_); - filler_param.set_min(0.1); - filler_param.set_max(2.0); - UniformFiller infogain_filler(filler_param); - infogain_filler.Fill(this->blob_bottom_infogain_); - blob_bottom_vec_.push_back(blob_bottom_infogain_); - blob_top_vec_.push_back(blob_top_loss_); - } - virtual ~InfogainLossLayerTest() { - delete blob_bottom_data_; - delete blob_bottom_label_; - delete blob_bottom_infogain_; - delete blob_top_loss_; - } - Blob* const blob_bottom_data_; - Blob* const blob_bottom_label_; - Blob* const blob_bottom_infogain_; - Blob* const blob_top_loss_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(InfogainLossLayerTest, TestDtypesAndDevices); - - -TYPED_TEST(InfogainLossLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter 
layer_param; - InfogainLossLayer layer(layer_param); - GradientChecker checker(1e-4, 2e-2, 1701, 1, 0.01); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -} // namespace caffe diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp deleted file mode 100644 index f1ec233..0000000 --- a/src/caffe/test/test_inner_product_layer.cpp +++ /dev/null @@ -1,391 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/inner_product_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -#ifndef CPU_ONLY -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; -#endif - -template -class InnerProductLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - protected: - InnerProductLayerTest() - : blob_bottom_(new Blob(2, 3, 4, 5)), - blob_bottom_nobatch_(new Blob(1, 2, 3, 4)), - blob_top_(new Blob()) { - // fill the values - FillerParameter filler_param; - UniformFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~InnerProductLayerTest() { - delete blob_bottom_; - delete blob_bottom_nobatch_; - delete blob_top_; - } - Blob* const blob_bottom_; - Blob* const blob_bottom_nobatch_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(InnerProductLayerTest, TestDtypesAndDevices); - -TYPED_TEST(InnerProductLayerTest, TestSetUp) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_); - LayerParameter layer_param; - InnerProductParameter* inner_product_param = - layer_param.mutable_inner_product_param(); - inner_product_param->set_num_output(10); - shared_ptr > layer( - new InnerProductLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); - EXPECT_EQ(this->blob_top_->channels(), 10); -} - -/** @brief TestSetUp while toggling tranpose flag - */ -TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeFalse) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_); - LayerParameter layer_param; - InnerProductParameter* inner_product_param = - layer_param.mutable_inner_product_param(); - inner_product_param->set_num_output(10); - inner_product_param->set_transpose(false); - shared_ptr > layer( - new InnerProductLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(2, this->blob_top_->num()); - EXPECT_EQ(1, this->blob_top_->height()); - EXPECT_EQ(1, this->blob_top_->width()); - EXPECT_EQ(10, this->blob_top_->channels()); - EXPECT_EQ(2, layer->blobs()[0]->num_axes()); - EXPECT_EQ(10, layer->blobs()[0]->shape(0)); - EXPECT_EQ(60, layer->blobs()[0]->shape(1)); -} - -/** @brief TestSetUp while toggling tranpose flag - */ -TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeTrue) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_); - LayerParameter layer_param; - InnerProductParameter* inner_product_param = - layer_param.mutable_inner_product_param(); - inner_product_param->set_num_output(10); - inner_product_param->set_transpose(true); - shared_ptr > layer( - new InnerProductLayer(layer_param)); - 
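// Illustrative sketch (not part of the original patch): the row-major transpose
// copy that TestForwardTranspose and TestBackwardTranspose below perform by hand
// when moving weights between the non-transposed and transposed layers. The
// helper name is hypothetical; src has shape (rows, cols), dst receives (cols, rows).
template <typename Dtype>
void TransposeCopy(const Dtype* src, Dtype* dst, int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      dst[c * rows + r] = src[r * cols + c];
    }
  }
}
// Here the weights are 10 x 60 (num_output x 3*4*5), so the transposed copy is
// 60 x 10, matching the shape expectations in the two TestSetUpTranpose tests.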
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(2, this->blob_top_->num()); - EXPECT_EQ(1, this->blob_top_->height()); - EXPECT_EQ(1, this->blob_top_->width()); - EXPECT_EQ(10, this->blob_top_->channels()); - EXPECT_EQ(2, layer->blobs()[0]->num_axes()); - EXPECT_EQ(60, layer->blobs()[0]->shape(0)); - EXPECT_EQ(10, layer->blobs()[0]->shape(1)); -} - -TYPED_TEST(InnerProductLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_); - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif - if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { - LayerParameter layer_param; - InnerProductParameter* inner_product_param = - layer_param.mutable_inner_product_param(); - inner_product_param->set_num_output(10); - inner_product_param->mutable_weight_filler()->set_type("uniform"); - inner_product_param->mutable_bias_filler()->set_type("uniform"); - inner_product_param->mutable_bias_filler()->set_min(1); - inner_product_param->mutable_bias_filler()->set_max(2); - shared_ptr > layer( - new InnerProductLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - for (int i = 0; i < count; ++i) { - EXPECT_GE(data[i], 1.); - } - } else { - LOG(ERROR) << "Skipping test due to old architecture."; - } -} - -/** - * @brief Init. an IP layer without transpose + random weights, - * run Forward, save the result. - * Init. another IP layer with transpose. - * manually copy and transpose the weights from the first IP layer, - * then run Forward on the same input and check that the result is the same - */ -TYPED_TEST(InnerProductLayerTest, TestForwardTranspose) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_); - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif - if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { - LayerParameter layer_param; - InnerProductParameter* inner_product_param = - layer_param.mutable_inner_product_param(); - inner_product_param->set_num_output(10); - inner_product_param->mutable_weight_filler()->set_type("uniform"); - inner_product_param->mutable_bias_filler()->set_type("uniform"); - inner_product_param->mutable_bias_filler()->set_min(1); - inner_product_param->mutable_bias_filler()->set_max(2); - inner_product_param->set_transpose(false); - shared_ptr > layer( - new InnerProductLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const int count = this->blob_top_->count(); - Blob* const top = new Blob(); - top->ReshapeLike(*this->blob_top_); - caffe_copy(count, this->blob_top_->cpu_data(), top->mutable_cpu_data()); - this->blob_top_vec_.clear(); - this->blob_top_vec_.push_back(new Blob()); - inner_product_param->set_transpose(true); - shared_ptr > ip_t( - new InnerProductLayer(layer_param)); - ip_t->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - const int count_w = layer->blobs()[0]->count(); - EXPECT_EQ(count_w, ip_t->blobs()[0]->count()); - // manually copy and transpose the weights from 1st IP layer into 2nd - const Dtype* w = layer->blobs()[0]->cpu_data(); - Dtype* w_t = ip_t->blobs()[0]->mutable_cpu_data(); - const 
int width = layer->blobs()[0]->shape(1); - const int width_t = ip_t->blobs()[0]->shape(1); - for (int i = 0; i < count_w; ++i) { - int r = i / width; - int c = i % width; - w_t[c*width_t+r] = w[r*width+c]; // copy while transposing - } - // copy bias from 1st IP layer to 2nd IP layer - ASSERT_EQ(layer->blobs()[1]->count(), ip_t->blobs()[1]->count()); - caffe_copy(layer->blobs()[1]->count(), layer->blobs()[1]->cpu_data(), - ip_t->blobs()[1]->mutable_cpu_data()); - ip_t->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(count, this->blob_top_->count()) - << "Invalid count for top blob for IP with transpose."; - Blob* const top_t = new Blob();\ - top_t->ReshapeLike(*this->blob_top_vec_[0]); - caffe_copy(count, - this->blob_top_vec_[0]->cpu_data(), - top_t->mutable_cpu_data()); - const Dtype* data = top->cpu_data(); - const Dtype* data_t = top_t->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_FLOAT_EQ(data[i], data_t[i]); - } - } else { - LOG(ERROR) << "Skipping test due to old architecture."; - } -} - -TYPED_TEST(InnerProductLayerTest, TestForwardNoBatch) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_nobatch_); - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif - if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { - LayerParameter layer_param; - InnerProductParameter* inner_product_param = - layer_param.mutable_inner_product_param(); - inner_product_param->set_num_output(10); - inner_product_param->mutable_weight_filler()->set_type("uniform"); - inner_product_param->mutable_bias_filler()->set_type("uniform"); - inner_product_param->mutable_bias_filler()->set_min(1); - inner_product_param->mutable_bias_filler()->set_max(2); - shared_ptr > layer( - new InnerProductLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - for (int i = 0; i < count; ++i) { - EXPECT_GE(data[i], 1.); - } - } else { - LOG(ERROR) << "Skipping test due to old architecture."; - } -} - -TYPED_TEST(InnerProductLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_); - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif - if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { - LayerParameter layer_param; - InnerProductParameter* inner_product_param = - layer_param.mutable_inner_product_param(); - inner_product_param->set_num_output(10); - inner_product_param->mutable_weight_filler()->set_type("gaussian"); - inner_product_param->mutable_bias_filler()->set_type("gaussian"); - inner_product_param->mutable_bias_filler()->set_min(1); - inner_product_param->mutable_bias_filler()->set_max(2); - InnerProductLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } else { - LOG(ERROR) << "Skipping test due to old architecture."; - } -} - -TYPED_TEST(InnerProductLayerTest, TestGradientTranspose) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_); - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif - if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 
4 || IS_VALID_CUDA) { - LayerParameter layer_param; - InnerProductParameter* inner_product_param = - layer_param.mutable_inner_product_param(); - inner_product_param->set_num_output(11); - inner_product_param->mutable_weight_filler()->set_type("gaussian"); - inner_product_param->mutable_bias_filler()->set_type("gaussian"); - inner_product_param->mutable_bias_filler()->set_min(1); - inner_product_param->mutable_bias_filler()->set_max(2); - inner_product_param->set_transpose(true); - InnerProductLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } else { - LOG(ERROR) << "Skipping test due to old architecture."; - } -} - -TYPED_TEST(InnerProductLayerTest, TestBackwardTranspose) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_); - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif - if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { - LayerParameter layer_param; - InnerProductParameter* inner_product_param = - layer_param.mutable_inner_product_param(); - inner_product_param->set_num_output(10); - inner_product_param->mutable_weight_filler()->set_type("uniform"); - inner_product_param->mutable_bias_filler()->set_type("uniform"); - inner_product_param->mutable_bias_filler()->set_min(1); - inner_product_param->mutable_bias_filler()->set_max(2); - inner_product_param->set_transpose(false); - shared_ptr > layer( - new InnerProductLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // copy top blob - Blob* const top = new Blob(); - top->CopyFrom(*this->blob_top_, false, true); - // fake top diff - Blob* const diff = new Blob(); - diff->ReshapeLike(*this->blob_top_); - { - FillerParameter filler_param; - UniformFiller filler(filler_param); - filler.Fill(diff); - } - caffe_copy(this->blob_top_vec_[0]->count(), - diff->cpu_data(), - this->blob_top_vec_[0]->mutable_cpu_diff()); - vector propagate_down(1, true); - layer->Backward(this->blob_top_vec_, - propagate_down, - this->blob_bottom_vec_); - // copy first ip's weights and their diffs - Blob* const w = new Blob(); - w->CopyFrom(*layer->blobs()[0], false, true); - w->CopyFrom(*layer->blobs()[0], true, true); - // copy bottom diffs - Blob* const bottom_diff = new Blob(); - bottom_diff->CopyFrom(*this->blob_bottom_vec_[0], true, true); - // repeat original top with tranposed ip - this->blob_top_vec_.clear(); - this->blob_top_vec_.push_back(new Blob()); - inner_product_param->set_transpose(true); - shared_ptr > ip_t( - new InnerProductLayer(layer_param)); - ip_t->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - // manually copy and transpose the weights from 1st IP layer into 2nd - { - const Dtype* w_src = w->cpu_data(); - Dtype* w_t = ip_t->blobs()[0]->mutable_cpu_data(); - const int width = layer->blobs()[0]->shape(1); - const int width_t = ip_t->blobs()[0]->shape(1); - for (int i = 0; i < layer->blobs()[0]->count(); ++i) { - int r = i / width; - int c = i % width; - w_t[c*width_t+r] = w_src[r*width+c]; // copy while transposing - } - // copy bias from 1st IP layer to 2nd IP layer - ASSERT_EQ(layer->blobs()[1]->count(), ip_t->blobs()[1]->count()); - caffe_copy(layer->blobs()[1]->count(), layer->blobs()[1]->cpu_data(), - ip_t->blobs()[1]->mutable_cpu_data()); - } - ip_t->Forward(this->blob_bottom_vec_, this->blob_top_vec_); 
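// Illustrative sketch (not part of the original patch): the central-difference
// estimate that GradientChecker-style tests (TestGradient and
// TestGradientTranspose above) compare analytic gradients against. f stands for
// any scalar loss of one parameter; the helper name is hypothetical.
#include <functional>
inline double NumericGradient(const std::function<double(double)>& f,
                              double x, double step = 1e-2) {
  return (f(x + step) - f(x - step)) / (2.0 * step);
}
// The checker perturbs each input by +/- step (the 1e-2 above), recomputes the
// loss, and requires the analytic and numeric gradients to agree within a
// scaled threshold (the 1e-3 above).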
- caffe_copy(this->blob_top_vec_[0]->count(), - diff->cpu_data(), - this->blob_top_vec_[0]->mutable_cpu_diff()); - ip_t->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - const Dtype* data = w->cpu_diff(); - const Dtype* data_t = ip_t->blobs()[0]->cpu_diff(); - const int WIDTH = layer->blobs()[0]->shape(1); - const int WIDTH_T = ip_t->blobs()[0]->shape(1); - for (int i = 0; i < layer->blobs()[0]->count(); ++i) { - int r = i / WIDTH; - int c = i % WIDTH; - EXPECT_NE(Dtype(0.), data[r*WIDTH+c]); - EXPECT_FLOAT_EQ(data[r*WIDTH+c], data_t[c*WIDTH_T+r]); - } - data = bottom_diff->cpu_diff(); - data_t = this->blob_bottom_vec_[0]->cpu_diff(); - for (int i = 0; i < this->blob_bottom_vec_[0]->count(); ++i) { - EXPECT_NE(Dtype(0.), data[i]); - EXPECT_FLOAT_EQ(data[i], data_t[i]); - } - } else { - LOG(ERROR) << "Skipping test due to old architecture."; - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_internal_thread.cpp b/src/caffe/test/test_internal_thread.cpp deleted file mode 100644 index 93f1cc5..0000000 --- a/src/caffe/test/test_internal_thread.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include "glog/logging.h" -#include "gtest/gtest.h" - -#include "caffe/internal_thread.hpp" -#include "caffe/util/math_functions.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - - -class InternalThreadTest : public ::testing::Test {}; - -TEST_F(InternalThreadTest, TestStartAndExit) { - InternalThread thread; - EXPECT_FALSE(thread.is_started()); - thread.StartInternalThread(); - EXPECT_TRUE(thread.is_started()); - thread.StopInternalThread(); - EXPECT_FALSE(thread.is_started()); -} - -class TestThreadA : public InternalThread { - void InternalThreadEntry() { - EXPECT_EQ(4244559767, caffe_rng_rand()); - } -}; - -class TestThreadB : public InternalThread { - void InternalThreadEntry() { - EXPECT_EQ(1726478280, caffe_rng_rand()); - } -}; - -TEST_F(InternalThreadTest, TestRandomSeed) { - TestThreadA t1; - Caffe::set_random_seed(9658361); - t1.StartInternalThread(); - t1.StopInternalThread(); - - TestThreadA t2; - Caffe::set_random_seed(9658361); - t2.StartInternalThread(); - t2.StopInternalThread(); - - TestThreadB t3; - Caffe::set_random_seed(3435563); - t3.StartInternalThread(); - t3.StopInternalThread(); -} - -} // namespace caffe - diff --git a/src/caffe/test/test_io.cpp b/src/caffe/test/test_io.cpp deleted file mode 100644 index c2c919e..0000000 --- a/src/caffe/test/test_io.cpp +++ /dev/null @@ -1,424 +0,0 @@ -#ifdef USE_OPENCV -#include -#include -#include -#include - -#include - -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/util/io.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -class IOTest : public ::testing::Test {}; - -bool ReadImageToDatumReference(const string& filename, const int label, - const int height, const int width, const bool is_color, Datum* datum) { - cv::Mat cv_img; - int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); - - cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag); - if (!cv_img_origin.data) { - LOG(ERROR) << "Could not open or find file " << filename; - return false; - } - if (height > 0 && width > 0) { - cv::resize(cv_img_origin, cv_img, cv::Size(width, height)); - } else { - cv_img = cv_img_origin; - } - - int num_channels = (is_color ? 
3 : 1); - datum->set_channels(num_channels); - datum->set_height(cv_img.rows); - datum->set_width(cv_img.cols); - datum->set_label(label); - datum->clear_data(); - datum->clear_float_data(); - string* datum_string = datum->mutable_data(); - if (is_color) { - for (int c = 0; c < num_channels; ++c) { - for (int h = 0; h < cv_img.rows; ++h) { - for (int w = 0; w < cv_img.cols; ++w) { - datum_string->push_back( - static_cast(cv_img.at(h, w)[c])); - } - } - } - } else { // Faster than repeatedly testing is_color for each pixel w/i loop - for (int h = 0; h < cv_img.rows; ++h) { - for (int w = 0; w < cv_img.cols; ++w) { - datum_string->push_back( - static_cast(cv_img.at(h, w))); - } - } - } - return true; -} - -TEST_F(IOTest, TestReadImageToDatum) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - ReadImageToDatum(filename, 0, &datum); - EXPECT_EQ(datum.channels(), 3); - EXPECT_EQ(datum.height(), 360); - EXPECT_EQ(datum.width(), 480); -} - -TEST_F(IOTest, TestReadImageToDatumReference) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum, datum_ref; - ReadImageToDatum(filename, 0, 0, 0, true, &datum); - ReadImageToDatumReference(filename, 0, 0, 0, true, &datum_ref); - EXPECT_EQ(datum.channels(), datum_ref.channels()); - EXPECT_EQ(datum.height(), datum_ref.height()); - EXPECT_EQ(datum.width(), datum_ref.width()); - EXPECT_EQ(datum.data().size(), datum_ref.data().size()); - - const string& data = datum.data(); - const string& data_ref = datum.data(); - - for (int i = 0; i < datum.data().size(); ++i) { - EXPECT_TRUE(data[i] == data_ref[i]); - } -} - - -TEST_F(IOTest, TestReadImageToDatumReferenceResized) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum, datum_ref; - ReadImageToDatum(filename, 0, 100, 200, true, &datum); - ReadImageToDatumReference(filename, 0, 100, 200, true, &datum_ref); - EXPECT_EQ(datum.channels(), datum_ref.channels()); - EXPECT_EQ(datum.height(), datum_ref.height()); - EXPECT_EQ(datum.width(), datum_ref.width()); - EXPECT_EQ(datum.data().size(), datum_ref.data().size()); - - const string& data = datum.data(); - const string& data_ref = datum.data(); - - for (int i = 0; i < datum.data().size(); ++i) { - EXPECT_TRUE(data[i] == data_ref[i]); - } -} - -TEST_F(IOTest, TestReadImageToDatumContent) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - ReadImageToDatum(filename, 0, &datum); - cv::Mat cv_img = ReadImageToCVMat(filename); - EXPECT_EQ(datum.channels(), cv_img.channels()); - EXPECT_EQ(datum.height(), cv_img.rows); - EXPECT_EQ(datum.width(), cv_img.cols); - - const string& data = datum.data(); - int index = 0; - for (int c = 0; c < datum.channels(); ++c) { - for (int h = 0; h < datum.height(); ++h) { - for (int w = 0; w < datum.width(); ++w) { - EXPECT_TRUE(data[index++] == - static_cast(cv_img.at(h, w)[c])); - } - } - } -} - -TEST_F(IOTest, TestReadImageToDatumContentGray) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - const bool is_color = false; - ReadImageToDatum(filename, 0, is_color, &datum); - cv::Mat cv_img = ReadImageToCVMat(filename, is_color); - EXPECT_EQ(datum.channels(), cv_img.channels()); - EXPECT_EQ(datum.height(), cv_img.rows); - EXPECT_EQ(datum.width(), cv_img.cols); - - const string& data = datum.data(); - int index = 0; - for (int h = 0; h < datum.height(); ++h) { - for (int w = 0; w < datum.width(); ++w) { - EXPECT_TRUE(data[index++] == static_cast(cv_img.at(h, w))); - } - } -} - -TEST_F(IOTest, TestReadImageToDatumResized) { - 
string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - ReadImageToDatum(filename, 0, 100, 200, &datum); - EXPECT_EQ(datum.channels(), 3); - EXPECT_EQ(datum.height(), 100); - EXPECT_EQ(datum.width(), 200); -} - - -TEST_F(IOTest, TestReadImageToDatumResizedSquare) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - ReadImageToDatum(filename, 0, 256, 256, &datum); - EXPECT_EQ(datum.channels(), 3); - EXPECT_EQ(datum.height(), 256); - EXPECT_EQ(datum.width(), 256); -} - -TEST_F(IOTest, TestReadImageToDatumGray) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - const bool is_color = false; - ReadImageToDatum(filename, 0, is_color, &datum); - EXPECT_EQ(datum.channels(), 1); - EXPECT_EQ(datum.height(), 360); - EXPECT_EQ(datum.width(), 480); -} - -TEST_F(IOTest, TestReadImageToDatumResizedGray) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - const bool is_color = false; - ReadImageToDatum(filename, 0, 256, 256, is_color, &datum); - EXPECT_EQ(datum.channels(), 1); - EXPECT_EQ(datum.height(), 256); - EXPECT_EQ(datum.width(), 256); -} - -TEST_F(IOTest, TestReadImageToCVMat) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - cv::Mat cv_img = ReadImageToCVMat(filename); - EXPECT_EQ(cv_img.channels(), 3); - EXPECT_EQ(cv_img.rows, 360); - EXPECT_EQ(cv_img.cols, 480); -} - -TEST_F(IOTest, TestReadImageToCVMatResized) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - cv::Mat cv_img = ReadImageToCVMat(filename, 100, 200); - EXPECT_EQ(cv_img.channels(), 3); - EXPECT_EQ(cv_img.rows, 100); - EXPECT_EQ(cv_img.cols, 200); -} - -TEST_F(IOTest, TestReadImageToCVMatResizedSquare) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - cv::Mat cv_img = ReadImageToCVMat(filename, 256, 256); - EXPECT_EQ(cv_img.channels(), 3); - EXPECT_EQ(cv_img.rows, 256); - EXPECT_EQ(cv_img.cols, 256); -} - -TEST_F(IOTest, TestReadImageToCVMatGray) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - const bool is_color = false; - cv::Mat cv_img = ReadImageToCVMat(filename, is_color); - EXPECT_EQ(cv_img.channels(), 1); - EXPECT_EQ(cv_img.rows, 360); - EXPECT_EQ(cv_img.cols, 480); -} - -TEST_F(IOTest, TestReadImageToCVMatResizedGray) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - const bool is_color = false; - cv::Mat cv_img = ReadImageToCVMat(filename, 256, 256, is_color); - EXPECT_EQ(cv_img.channels(), 1); - EXPECT_EQ(cv_img.rows, 256); - EXPECT_EQ(cv_img.cols, 256); -} - -TEST_F(IOTest, TestCVMatToDatum) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - cv::Mat cv_img = ReadImageToCVMat(filename); - Datum datum; - CVMatToDatum(cv_img, &datum); - EXPECT_EQ(datum.channels(), 3); - EXPECT_EQ(datum.height(), 360); - EXPECT_EQ(datum.width(), 480); -} - -TEST_F(IOTest, TestCVMatToDatumContent) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - cv::Mat cv_img = ReadImageToCVMat(filename); - Datum datum; - CVMatToDatum(cv_img, &datum); - Datum datum_ref; - ReadImageToDatum(filename, 0, &datum_ref); - EXPECT_EQ(datum.channels(), datum_ref.channels()); - EXPECT_EQ(datum.height(), datum_ref.height()); - EXPECT_EQ(datum.width(), datum_ref.width()); - EXPECT_EQ(datum.data().size(), datum_ref.data().size()); - - const string& data = datum.data(); - const string& data_ref = datum_ref.data(); - for (int i = 0; i < datum.data().size(); ++i) { - EXPECT_TRUE(data[i] == data_ref[i]); - } -} - -TEST_F(IOTest, TestCVMatToDatumReference) { - string filename = EXAMPLES_SOURCE_DIR 
"images/cat.jpg"; - cv::Mat cv_img = ReadImageToCVMat(filename); - Datum datum; - CVMatToDatum(cv_img, &datum); - Datum datum_ref; - ReadImageToDatumReference(filename, 0, 0, 0, true, &datum_ref); - EXPECT_EQ(datum.channels(), datum_ref.channels()); - EXPECT_EQ(datum.height(), datum_ref.height()); - EXPECT_EQ(datum.width(), datum_ref.width()); - EXPECT_EQ(datum.data().size(), datum_ref.data().size()); - - const string& data = datum.data(); - const string& data_ref = datum_ref.data(); - for (int i = 0; i < datum.data().size(); ++i) { - EXPECT_TRUE(data[i] == data_ref[i]); - } -} - -TEST_F(IOTest, TestReadFileToDatum) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - EXPECT_TRUE(ReadFileToDatum(filename, &datum)); - EXPECT_TRUE(datum.encoded()); - EXPECT_EQ(datum.label(), -1); - EXPECT_EQ(datum.data().size(), 140391); -} - -TEST_F(IOTest, TestDecodeDatum) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - EXPECT_TRUE(ReadFileToDatum(filename, &datum)); - EXPECT_TRUE(DecodeDatum(&datum, true)); - EXPECT_FALSE(DecodeDatum(&datum, true)); - Datum datum_ref; - ReadImageToDatumReference(filename, 0, 0, 0, true, &datum_ref); - EXPECT_EQ(datum.channels(), datum_ref.channels()); - EXPECT_EQ(datum.height(), datum_ref.height()); - EXPECT_EQ(datum.width(), datum_ref.width()); - EXPECT_EQ(datum.data().size(), datum_ref.data().size()); - - const string& data = datum.data(); - const string& data_ref = datum_ref.data(); - for (int i = 0; i < datum.data().size(); ++i) { - EXPECT_TRUE(data[i] == data_ref[i]); - } -} - -TEST_F(IOTest, TestDecodeDatumToCVMat) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - EXPECT_TRUE(ReadFileToDatum(filename, &datum)); - cv::Mat cv_img = DecodeDatumToCVMat(datum, true); - EXPECT_EQ(cv_img.channels(), 3); - EXPECT_EQ(cv_img.rows, 360); - EXPECT_EQ(cv_img.cols, 480); - cv_img = DecodeDatumToCVMat(datum, false); - EXPECT_EQ(cv_img.channels(), 1); - EXPECT_EQ(cv_img.rows, 360); - EXPECT_EQ(cv_img.cols, 480); -} - -TEST_F(IOTest, TestDecodeDatumToCVMatContent) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - EXPECT_TRUE(ReadImageToDatum(filename, 0, std::string("jpg"), &datum)); - cv::Mat cv_img = DecodeDatumToCVMat(datum, true); - cv::Mat cv_img_ref = ReadImageToCVMat(filename); - EXPECT_EQ(cv_img_ref.channels(), cv_img.channels()); - EXPECT_EQ(cv_img_ref.rows, cv_img.rows); - EXPECT_EQ(cv_img_ref.cols, cv_img.cols); - - for (int c = 0; c < datum.channels(); ++c) { - for (int h = 0; h < datum.height(); ++h) { - for (int w = 0; w < datum.width(); ++w) { - EXPECT_TRUE(cv_img.at(h, w)[c]== - cv_img_ref.at(h, w)[c]); - } - } - } -} - -TEST_F(IOTest, TestDecodeDatumNative) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - EXPECT_TRUE(ReadFileToDatum(filename, &datum)); - EXPECT_TRUE(DecodeDatumNative(&datum)); - EXPECT_FALSE(DecodeDatumNative(&datum)); - Datum datum_ref; - ReadImageToDatumReference(filename, 0, 0, 0, true, &datum_ref); - EXPECT_EQ(datum.channels(), datum_ref.channels()); - EXPECT_EQ(datum.height(), datum_ref.height()); - EXPECT_EQ(datum.width(), datum_ref.width()); - EXPECT_EQ(datum.data().size(), datum_ref.data().size()); - - const string& data = datum.data(); - const string& data_ref = datum_ref.data(); - for (int i = 0; i < datum.data().size(); ++i) { - EXPECT_TRUE(data[i] == data_ref[i]); - } -} - -TEST_F(IOTest, TestDecodeDatumToCVMatNative) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - 
EXPECT_TRUE(ReadFileToDatum(filename, &datum)); - cv::Mat cv_img = DecodeDatumToCVMatNative(datum); - EXPECT_EQ(cv_img.channels(), 3); - EXPECT_EQ(cv_img.rows, 360); - EXPECT_EQ(cv_img.cols, 480); -} - -TEST_F(IOTest, TestDecodeDatumNativeGray) { - string filename = EXAMPLES_SOURCE_DIR "images/cat_gray.jpg"; - Datum datum; - EXPECT_TRUE(ReadFileToDatum(filename, &datum)); - EXPECT_TRUE(DecodeDatumNative(&datum)); - EXPECT_FALSE(DecodeDatumNative(&datum)); - Datum datum_ref; - ReadImageToDatumReference(filename, 0, 0, 0, false, &datum_ref); - EXPECT_EQ(datum.channels(), datum_ref.channels()); - EXPECT_EQ(datum.height(), datum_ref.height()); - EXPECT_EQ(datum.width(), datum_ref.width()); - EXPECT_EQ(datum.data().size(), datum_ref.data().size()); - - const string& data = datum.data(); - const string& data_ref = datum_ref.data(); - for (int i = 0; i < datum.data().size(); ++i) { - EXPECT_TRUE(data[i] == data_ref[i]); - } -} - -TEST_F(IOTest, TestDecodeDatumToCVMatNativeGray) { - string filename = EXAMPLES_SOURCE_DIR "images/cat_gray.jpg"; - Datum datum; - EXPECT_TRUE(ReadFileToDatum(filename, &datum)); - cv::Mat cv_img = DecodeDatumToCVMatNative(datum); - EXPECT_EQ(cv_img.channels(), 1); - EXPECT_EQ(cv_img.rows, 360); - EXPECT_EQ(cv_img.cols, 480); -} - -TEST_F(IOTest, TestDecodeDatumToCVMatContentNative) { - string filename = EXAMPLES_SOURCE_DIR "images/cat.jpg"; - Datum datum; - EXPECT_TRUE(ReadImageToDatum(filename, 0, std::string("jpg"), &datum)); - cv::Mat cv_img = DecodeDatumToCVMatNative(datum); - cv::Mat cv_img_ref = ReadImageToCVMat(filename); - EXPECT_EQ(cv_img_ref.channels(), cv_img.channels()); - EXPECT_EQ(cv_img_ref.rows, cv_img.rows); - EXPECT_EQ(cv_img_ref.cols, cv_img.cols); - - for (int c = 0; c < datum.channels(); ++c) { - for (int h = 0; h < datum.height(); ++h) { - for (int w = 0; w < datum.width(); ++w) { - EXPECT_TRUE(cv_img.at(h, w)[c]== - cv_img_ref.at(h, w)[c]); - } - } - } -} - -} // namespace caffe -#endif // USE_OPENCV diff --git a/src/caffe/test/test_layer_factory.cpp b/src/caffe/test/test_layer_factory.cpp deleted file mode 100644 index 7d5d39d..0000000 --- a/src/caffe/test/test_layer_factory.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include -#include - -#include "boost/scoped_ptr.hpp" -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/layer_factory.hpp" -#include "caffe/util/db.hpp" -#include "caffe/util/io.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class LayerFactoryTest : public MultiDeviceTest {}; - -TYPED_TEST_CASE(LayerFactoryTest, TestDtypesAndDevices); - -TYPED_TEST(LayerFactoryTest, TestCreateLayer) { - typedef typename TypeParam::Dtype Dtype; - typename LayerRegistry::CreatorRegistry& registry = - LayerRegistry::Registry(); - shared_ptr > layer; - for (typename LayerRegistry::CreatorRegistry::iterator iter = - registry.begin(); iter != registry.end(); ++iter) { - // Special case: PythonLayer is checked by pytest - if (iter->first == "Python") { continue; } - LayerParameter layer_param; - // Data layers expect a DB - if (iter->first == "Data") { -#ifdef USE_LEVELDB - string tmp; - MakeTempDir(&tmp); - boost::scoped_ptr db(db::GetDB(DataParameter_DB_LEVELDB)); - db->Open(tmp, db::NEW); - db->Close(); - layer_param.mutable_data_param()->set_source(tmp); -#else - continue; -#endif // USE_LEVELDB - } - layer_param.set_type(iter->first); - layer = LayerRegistry::CreateLayer(layer_param); - EXPECT_EQ(iter->first, layer->type()); - } -} - -} // namespace caffe diff --git 
a/src/caffe/test/test_lrn_layer.cpp b/src/caffe/test/test_lrn_layer.cpp deleted file mode 100644 index 4c97b1a..0000000 --- a/src/caffe/test/test_lrn_layer.cpp +++ /dev/null @@ -1,450 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/lrn_layer.hpp" - -#ifdef USE_CUDNN -#include "caffe/layers/cudnn_lcn_layer.hpp" -#include "caffe/layers/cudnn_lrn_layer.hpp" -#endif - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -using std::min; -using std::max; - -namespace caffe { - -template -class LRNLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - LRNLayerTest() - : epsilon_(Dtype(1e-5)), - blob_bottom_(new Blob()), - blob_top_(new Blob()) {} - virtual void SetUp() { - Caffe::set_random_seed(1701); - blob_bottom_->Reshape(2, 7, 3, 3); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~LRNLayerTest() { delete blob_bottom_; delete blob_top_; } - void ReferenceLRNForward(const Blob& blob_bottom, - const LayerParameter& layer_param, Blob* blob_top); - - Dtype epsilon_; - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -template -void LRNLayerTest::ReferenceLRNForward( - const Blob& blob_bottom, const LayerParameter& layer_param, - Blob* blob_top) { - typedef typename TypeParam::Dtype Dtype; - blob_top->Reshape(blob_bottom.num(), blob_bottom.channels(), - blob_bottom.height(), blob_bottom.width()); - Dtype* top_data = blob_top->mutable_cpu_data(); - LRNParameter lrn_param = layer_param.lrn_param(); - Dtype alpha = lrn_param.alpha(); - Dtype beta = lrn_param.beta(); - int size = lrn_param.local_size(); - switch (lrn_param.norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - for (int n = 0; n < blob_bottom.num(); ++n) { - for (int c = 0; c < blob_bottom.channels(); ++c) { - for (int h = 0; h < blob_bottom.height(); ++h) { - for (int w = 0; w < blob_bottom.width(); ++w) { - int c_start = c - (size - 1) / 2; - int c_end = min(c_start + size, blob_bottom.channels()); - c_start = max(c_start, 0); - Dtype scale = 1.; - for (int i = c_start; i < c_end; ++i) { - Dtype value = blob_bottom.data_at(n, i, h, w); - scale += value * value * alpha / size; - } - *(top_data + blob_top->offset(n, c, h, w)) = - blob_bottom.data_at(n, c, h, w) / pow(scale, beta); - } - } - } - } - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - for (int n = 0; n < blob_bottom.num(); ++n) { - for (int c = 0; c < blob_bottom.channels(); ++c) { - for (int h = 0; h < blob_bottom.height(); ++h) { - int h_start = h - (size - 1) / 2; - int h_end = min(h_start + size, blob_bottom.height()); - h_start = max(h_start, 0); - for (int w = 0; w < blob_bottom.width(); ++w) { - Dtype scale = 1.; - int w_start = w - (size - 1) / 2; - int w_end = min(w_start + size, blob_bottom.width()); - w_start = max(w_start, 0); - for (int nh = h_start; nh < h_end; ++nh) { - for (int nw = w_start; nw < w_end; ++nw) { - Dtype value = blob_bottom.data_at(n, c, nh, nw); - scale += value * value * alpha / (size * size); - } - } - *(top_data + blob_top->offset(n, c, h, w)) = - blob_bottom.data_at(n, c, h, w) / pow(scale, beta); - } - } - } - } - break; - default: - LOG(FATAL) << "Unknown normalization region."; - 
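// [Editor's note] Illustrative sketch only, not part of the patch above; the
// function name is made up. The reference forward pass being deleted computes,
// for ACROSS_CHANNELS local response normalization,
//   y[c] = x[c] / (1 + (alpha / size) * sum_i x[i]^2)^beta,
// where the sum runs over a window of `size` channels centred on c. The same
// per-pixel computation, written standalone over one channel vector:
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> LRNAcrossChannels(const std::vector<float>& x,
                                     int size, float alpha, float beta) {
  const int channels = static_cast<int>(x.size());
  std::vector<float> y(channels);
  for (int c = 0; c < channels; ++c) {
    const int c_start = std::max(c - (size - 1) / 2, 0);
    const int c_end = std::min(c - (size - 1) / 2 + size, channels);
    float scale = 1.f;
    for (int i = c_start; i < c_end; ++i) {
      scale += x[i] * x[i] * alpha / size;  // accumulate squared activations
    }
    y[c] = x[c] / std::pow(scale, beta);
  }
  return y;
}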
} -} - -TYPED_TEST_CASE(LRNLayerTest, TestDtypesAndDevices); - -TYPED_TEST(LRNLayerTest, TestSetupAcrossChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - LRNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 7); - EXPECT_EQ(this->blob_top_->height(), 3); - EXPECT_EQ(this->blob_top_->width(), 3); -} - -TYPED_TEST(LRNLayerTest, TestForwardAcrossChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - LRNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Blob top_reference; - this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, - &top_reference); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], - this->epsilon_); - } -} - -TYPED_TEST(LRNLayerTest, TestForwardAcrossChannelsLargeRegion) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_local_size(15); - LRNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Blob top_reference; - this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, - &top_reference); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], - this->epsilon_); - } -} - -TYPED_TEST(LRNLayerTest, TestGradientAcrossChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - LRNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - vector propagate_down(this->blob_bottom_vec_.size(), true); - layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - // for (int i = 0; i < this->blob_bottom_->count(); ++i) { - // std::cout << "CPU diff " << this->blob_bottom_->cpu_diff()[i] - // << std::endl; - // } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(LRNLayerTest, TestGradientAcrossChannelsLargeRegion) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_local_size(15); - LRNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - vector propagate_down(this->blob_bottom_vec_.size(), true); - layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - // for (int i = 0; i < this->blob_bottom_->count(); ++i) { - // std::cout << "CPU diff " << this->blob_bottom_->cpu_diff()[i] - // << std::endl; - // } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(LRNLayerTest, TestSetupWithinChannel) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_norm_region( - LRNParameter_NormRegion_WITHIN_CHANNEL); - 
layer_param.mutable_lrn_param()->set_local_size(3); - LRNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 7); - EXPECT_EQ(this->blob_top_->height(), 3); - EXPECT_EQ(this->blob_top_->width(), 3); -} - -TYPED_TEST(LRNLayerTest, TestForwardWithinChannel) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_norm_region( - LRNParameter_NormRegion_WITHIN_CHANNEL); - layer_param.mutable_lrn_param()->set_local_size(3); - LRNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Blob top_reference; - this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, - &top_reference); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], - this->epsilon_); - } -} - -TYPED_TEST(LRNLayerTest, TestGradientWithinChannel) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_norm_region( - LRNParameter_NormRegion_WITHIN_CHANNEL); - layer_param.mutable_lrn_param()->set_local_size(3); - LRNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -#ifdef USE_CUDNN -template -class CuDNNLRNLayerTest : public GPUDeviceTest { - protected: - CuDNNLRNLayerTest() - : epsilon_(Dtype(1e-5)), - blob_bottom_(new Blob()), - blob_top_(new Blob()) {} - virtual void SetUp() { - Caffe::set_random_seed(1701); - blob_bottom_->Reshape(2, 7, 3, 3); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~CuDNNLRNLayerTest() { delete blob_bottom_; delete blob_top_; } - void ReferenceLRNForward(const Blob& blob_bottom, - const LayerParameter& layer_param, Blob* blob_top); - - Dtype epsilon_; - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -template -void CuDNNLRNLayerTest::ReferenceLRNForward( - const Blob& blob_bottom, const LayerParameter& layer_param, - Blob* blob_top) { - typedef TypeParam Dtype; - blob_top->Reshape(blob_bottom.num(), blob_bottom.channels(), - blob_bottom.height(), blob_bottom.width()); - Dtype* top_data = blob_top->mutable_cpu_data(); - LRNParameter lrn_param = layer_param.lrn_param(); - Dtype alpha = lrn_param.alpha(); - Dtype beta = lrn_param.beta(); - int size = lrn_param.local_size(); - switch (lrn_param.norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - for (int n = 0; n < blob_bottom.num(); ++n) { - for (int c = 0; c < blob_bottom.channels(); ++c) { - for (int h = 0; h < blob_bottom.height(); ++h) { - for (int w = 0; w < blob_bottom.width(); ++w) { - int c_start = c - (size - 1) / 2; - int c_end = min(c_start + size, blob_bottom.channels()); - c_start = max(c_start, 0); - Dtype scale = 1.; - for (int i = c_start; i < c_end; ++i) { - Dtype value = blob_bottom.data_at(n, i, h, w); - scale += value * value * alpha / size; - } - 
*(top_data + blob_top->offset(n, c, h, w)) = - blob_bottom.data_at(n, c, h, w) / pow(scale, beta); - } - } - } - } - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - for (int n = 0; n < blob_bottom.num(); ++n) { - for (int c = 0; c < blob_bottom.channels(); ++c) { - for (int h = 0; h < blob_bottom.height(); ++h) { - int h_start = h - (size - 1) / 2; - int h_end = min(h_start + size, blob_bottom.height()); - h_start = max(h_start, 0); - for (int w = 0; w < blob_bottom.width(); ++w) { - Dtype scale = 1.; - int w_start = w - (size - 1) / 2; - int w_end = min(w_start + size, blob_bottom.width()); - w_start = max(w_start, 0); - for (int nh = h_start; nh < h_end; ++nh) { - for (int nw = w_start; nw < w_end; ++nw) { - Dtype value = blob_bottom.data_at(n, c, nh, nw); - scale += value * value * alpha / (size * size); - } - } - *(top_data + blob_top->offset(n, c, h, w)) = - blob_bottom.data_at(n, c, h, w) / pow(scale, beta); - } - } - } - } - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -TYPED_TEST_CASE(CuDNNLRNLayerTest, TestDtypes); - -TYPED_TEST(CuDNNLRNLayerTest, TestForwardAcrossChannelsCuDNN) { - // typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - CuDNNLRNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Blob top_reference; - this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, - &top_reference); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], - this->epsilon_); - } -} - -TYPED_TEST(CuDNNLRNLayerTest, TestForwardAcrossChannelsLargeRegionCuDNN) { - typedef TypeParam Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_local_size(15); - CuDNNLRNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Blob top_reference; - this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, - &top_reference); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], - this->epsilon_); - } -} - -TYPED_TEST(CuDNNLRNLayerTest, TestGradientAcrossChannelsCuDNN) { - typedef TypeParam Dtype; - LayerParameter layer_param; - CuDNNLRNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - vector propagate_down(this->blob_bottom_vec_.size(), true); - layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(CuDNNLRNLayerTest, TestForwardWithinChannel) { - typedef TypeParam Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_norm_region( - LRNParameter_NormRegion_WITHIN_CHANNEL); - layer_param.mutable_lrn_param()->set_local_size(3); - CuDNNLCNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Blob top_reference; - this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, - &top_reference); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(this->blob_top_->cpu_data()[i], 
top_reference.cpu_data()[i], - this->epsilon_); - } -} - -TYPED_TEST(CuDNNLRNLayerTest, TestGradientWithinChannel) { - typedef TypeParam Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_norm_region( - LRNParameter_NormRegion_WITHIN_CHANNEL); - layer_param.mutable_lrn_param()->set_local_size(3); - CuDNNLCNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(CuDNNLRNLayerTest, TestGradientAcrossChannelsLargeRegionCuDNN) { - typedef TypeParam Dtype; - LayerParameter layer_param; - layer_param.mutable_lrn_param()->set_local_size(15); - CuDNNLRNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - vector propagate_down(this->blob_bottom_vec_.size(), true); - layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -#endif - -} // namespace caffe diff --git a/src/caffe/test/test_lstm_layer.cpp b/src/caffe/test/test_lstm_layer.cpp deleted file mode 100644 index 51905ba..0000000 --- a/src/caffe/test/test_lstm_layer.cpp +++ /dev/null @@ -1,288 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/lstm_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class LSTMLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - LSTMLayerTest() : num_output_(7) { - blob_bottom_vec_.push_back(&blob_bottom_); - blob_bottom_vec_.push_back(&blob_bottom_cont_); - blob_top_vec_.push_back(&blob_top_); - unit_blob_bottom_vec_.push_back(&unit_blob_bottom_c_prev_); - unit_blob_bottom_vec_.push_back(&unit_blob_bottom_x_); - unit_blob_bottom_vec_.push_back(&unit_blob_bottom_cont_); - unit_blob_top_vec_.push_back(&unit_blob_top_c_); - unit_blob_top_vec_.push_back(&unit_blob_top_h_); - - ReshapeBlobs(1, 3); - - layer_param_.mutable_recurrent_param()->set_num_output(num_output_); - FillerParameter* weight_filler = - layer_param_.mutable_recurrent_param()->mutable_weight_filler(); - weight_filler->set_type("gaussian"); - weight_filler->set_std(0.2); - FillerParameter* bias_filler = - layer_param_.mutable_recurrent_param()->mutable_bias_filler(); - bias_filler->set_type("gaussian"); - bias_filler->set_std(0.1); - - layer_param_.set_phase(TEST); - } - - void ReshapeBlobs(int num_timesteps, int num_instances) { - blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); - blob_bottom_static_.Reshape(num_instances, 2, 3, 4); - vector shape(2); - shape[0] = num_timesteps; - shape[1] = num_instances; - blob_bottom_cont_.Reshape(shape); - shape.push_back(num_output_); - - shape[0] = 1; shape[1] = num_instances; shape[2] = 4 * num_output_; - unit_blob_bottom_x_.Reshape(shape); - shape[0] = 1; shape[1] = num_instances; shape[2] = num_output_; - 
unit_blob_bottom_c_prev_.Reshape(shape); - shape.resize(2); - shape[0] = 1; shape[1] = num_instances; - unit_blob_bottom_cont_.Reshape(shape); - - FillerParameter filler_param; - filler_param.set_min(-1); - filler_param.set_max(1); - UniformFiller filler(filler_param); - filler.Fill(&blob_bottom_); - filler.Fill(&unit_blob_bottom_c_prev_); - filler.Fill(&unit_blob_bottom_x_); - } - - int num_output_; - LayerParameter layer_param_; - Blob blob_bottom_; - Blob blob_bottom_cont_; - Blob blob_bottom_static_; - Blob blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - - Blob unit_blob_bottom_cont_; - Blob unit_blob_bottom_c_prev_; - Blob unit_blob_bottom_x_; - Blob unit_blob_top_c_; - Blob unit_blob_top_h_; - vector*> unit_blob_bottom_vec_; - vector*> unit_blob_top_vec_; -}; - -TYPED_TEST_CASE(LSTMLayerTest, TestDtypesAndDevices); - -TYPED_TEST(LSTMLayerTest, TestSetUp) { - typedef typename TypeParam::Dtype Dtype; - LSTMLayer layer(this->layer_param_); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - vector expected_top_shape = this->blob_bottom_.shape(); - expected_top_shape.resize(3); - expected_top_shape[2] = this->num_output_; - EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); -} - -TYPED_TEST(LSTMLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - const int kNumTimesteps = 3; - const int num = this->blob_bottom_.shape(1); - this->ReshapeBlobs(kNumTimesteps, num); - - // Fill the cont blob with <0, 1, 1, ..., 1>, - // indicating a sequence that begins at the first timestep - // then continues for the rest of the sequence. - for (int t = 0; t < kNumTimesteps; ++t) { - for (int n = 0; n < num; ++n) { - this->blob_bottom_cont_.mutable_cpu_data()[t * num + n] = t > 0; - } - } - - // Process the full sequence in a single batch. - FillerParameter filler_param; - filler_param.set_mean(0); - filler_param.set_std(1); - GaussianFiller sequence_filler(filler_param); - Caffe::set_random_seed(1); - sequence_filler.Fill(&this->blob_bottom_); - shared_ptr > layer(new LSTMLayer(this->layer_param_)); - Caffe::set_random_seed(1701); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - LOG(INFO) << "Calling forward for full sequence LSTM"; - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - // Copy the inputs and outputs to reuse/check them later. - Blob bottom_copy(this->blob_bottom_.shape()); - bottom_copy.CopyFrom(this->blob_bottom_); - Blob top_copy(this->blob_top_.shape()); - top_copy.CopyFrom(this->blob_top_); - - // Process the batch one timestep at a time; - // check that we get the same result. 
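// [Editor's note] Illustrative sketch only, not part of the patch above, and
// deliberately not the LSTM math. The test around this point relies on the
// "cont" (continuation) input: cont = 0 marks the start of a sequence and
// resets the recurrent state, cont = 1 carries it over, which is why stepping
// one timestep at a time with matching cont values reproduces the
// full-sequence forward pass. A toy recurrence with the same reset semantics:
#include <cstddef>
#include <vector>

struct ToyRecurrentState {
  std::vector<float> h;  // carried hidden state
};

void ToyStep(const std::vector<float>& x, float cont, ToyRecurrentState* s) {
  if (s->h.empty()) s->h.assign(x.size(), 0.f);
  for (std::size_t i = 0; i < x.size(); ++i) {
    const float h_prev = cont * s->h[i];   // cont == 0 wipes the carried state
    s->h[i] = 0.5f * h_prev + x[i];        // placeholder update, not LSTM gates
  }
}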
- this->ReshapeBlobs(1, num); - layer.reset(new LSTMLayer(this->layer_param_)); - Caffe::set_random_seed(1701); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - const int bottom_count = this->blob_bottom_.count(); - const int top_count = this->blob_top_.count(); - const Dtype kEpsilon = 1e-5; - for (int t = 0; t < kNumTimesteps; ++t) { - caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, - this->blob_bottom_.mutable_cpu_data()); - for (int n = 0; n < num; ++n) { - this->blob_bottom_cont_.mutable_cpu_data()[n] = t > 0; - } - LOG(INFO) << "Calling forward for LSTM timestep " << t; - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < top_count; ++i) { - ASSERT_LT(t * top_count + i, top_copy.count()); - EXPECT_NEAR(this->blob_top_.cpu_data()[i], - top_copy.cpu_data()[t * top_count + i], kEpsilon) - << "t = " << t << "; i = " << i; - } - } - - // Process the batch one timestep at a time with all cont blobs set to 0. - // Check that we get a different result, except in the first timestep. - Caffe::set_random_seed(1701); - layer.reset(new LSTMLayer(this->layer_param_)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - for (int t = 0; t < kNumTimesteps; ++t) { - caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, - this->blob_bottom_.mutable_cpu_data()); - for (int n = 0; n < num; ++n) { - this->blob_bottom_cont_.mutable_cpu_data()[n] = 0; - } - LOG(INFO) << "Calling forward for LSTM timestep " << t; - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < top_count; ++i) { - if (t == 0) { - EXPECT_NEAR(this->blob_top_.cpu_data()[i], - top_copy.cpu_data()[t * top_count + i], kEpsilon) - << "t = " << t << "; i = " << i; - } else { - EXPECT_NE(this->blob_top_.cpu_data()[i], - top_copy.cpu_data()[t * top_count + i]) - << "t = " << t << "; i = " << i; - } - } - } -} - -TYPED_TEST(LSTMLayerTest, TestLSTMUnitSetUp) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - LSTMUnitLayer layer(layer_param); - layer.SetUp(this->unit_blob_bottom_vec_, this->unit_blob_top_vec_); - const int num_axes = this->unit_blob_bottom_c_prev_.num_axes(); - ASSERT_EQ(num_axes, this->unit_blob_top_c_.num_axes()); - ASSERT_EQ(num_axes, this->unit_blob_top_h_.num_axes()); - for (int i = 0; i < num_axes; ++i) { - EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), - this->unit_blob_top_c_.shape(i)); - EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), - this->unit_blob_top_h_.shape(i)); - } -} - -TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - LSTMUnitLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - Dtype* cont_data = this->blob_bottom_cont_.mutable_cpu_data(); - cont_data[0] = 0; - cont_data[1] = 0; - cont_data[2] = 0; - checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, - this->unit_blob_top_vec_, 0); - checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, - this->unit_blob_top_vec_, 1); -} - -TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradientNonZeroCont) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - LSTMUnitLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - Dtype* cont_data = this->blob_bottom_cont_.mutable_cpu_data(); - cont_data[0] = 1; - cont_data[1] = 0; - cont_data[2] = 1; - checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, - this->unit_blob_top_vec_, 0); - 
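// [Editor's note] Illustrative sketch only, not part of the patch above; the
// helper is made up. The CheckGradientExhaustive calls in these tests compare
// the analytic gradient from Backward() against a centred finite-difference
// estimate of the forward output, element by element. The numeric side of
// that comparison is essentially:
#include <functional>
#include <vector>

// Estimate df/dx_i of a scalar-valued function f at point x.
double NumericGradient(
    const std::function<double(const std::vector<double>&)>& f,
    std::vector<double> x, std::size_t i, double step = 1e-2) {
  x[i] += step;
  const double f_plus = f(x);
  x[i] -= 2 * step;
  const double f_minus = f(x);
  return (f_plus - f_minus) / (2 * step);  // centred difference
}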
checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, - this->unit_blob_top_vec_, 1); -} - -TYPED_TEST(LSTMLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LSTMLayer layer(this->layer_param_); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -TYPED_TEST(LSTMLayerTest, TestGradientNonZeroCont) { - typedef typename TypeParam::Dtype Dtype; - LSTMLayer layer(this->layer_param_); - GradientChecker checker(1e-2, 1e-3); - for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { - this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; - } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -TYPED_TEST(LSTMLayerTest, TestGradientNonZeroContBufferSize2) { - typedef typename TypeParam::Dtype Dtype; - this->ReshapeBlobs(2, 2); - FillerParameter filler_param; - UniformFiller filler(filler_param); - filler.Fill(&this->blob_bottom_); - LSTMLayer layer(this->layer_param_); - GradientChecker checker(1e-2, 1e-3); - for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { - this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; - } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -TYPED_TEST(LSTMLayerTest, TestGradientNonZeroContBufferSize2WithStaticInput) { - typedef typename TypeParam::Dtype Dtype; - this->ReshapeBlobs(2, 2); - FillerParameter filler_param; - UniformFiller filler(filler_param); - filler.Fill(&this->blob_bottom_); - filler.Fill(&this->blob_bottom_static_); - this->blob_bottom_vec_.push_back(&this->blob_bottom_static_); - LSTMLayer layer(this->layer_param_); - GradientChecker checker(1e-2, 1e-3); - for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { - this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; - } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 2); -} - - -} // namespace caffe diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp deleted file mode 100644 index efc5a27..0000000 --- a/src/caffe/test/test_math_functions.cpp +++ /dev/null @@ -1,203 +0,0 @@ -#include // for uint32_t & uint64_t -#include -#include // for std::fabs - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/util/math_functions.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class MathFunctionsTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - MathFunctionsTest() - : blob_bottom_(new Blob()), - blob_top_(new Blob()) { - } - - virtual void SetUp() { - Caffe::set_random_seed(1701); - this->blob_bottom_->Reshape(11, 17, 19, 23); - this->blob_top_->Reshape(11, 17, 19, 23); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - filler.Fill(this->blob_top_); - } - - virtual ~MathFunctionsTest() { - delete blob_bottom_; - delete blob_top_; - } - - Blob* const blob_bottom_; - Blob* const blob_top_; -}; - -template -class CPUMathFunctionsTest - : public MathFunctionsTest > { -}; - -TYPED_TEST_CASE(CPUMathFunctionsTest, TestDtypes); - -TYPED_TEST(CPUMathFunctionsTest, TestNothing) { - // The first test case of a test suite takes the longest time - // due to the set up overhead. 
-} - -TYPED_TEST(CPUMathFunctionsTest, TestAsum) { - int n = this->blob_bottom_->count(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - TypeParam std_asum = 0; - for (int i = 0; i < n; ++i) { - std_asum += std::fabs(x[i]); - } - TypeParam cpu_asum = caffe_cpu_asum(n, x); - EXPECT_LT((cpu_asum - std_asum) / std_asum, 1e-2); -} - -TYPED_TEST(CPUMathFunctionsTest, TestSign) { - int n = this->blob_bottom_->count(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - caffe_cpu_sign(n, x, this->blob_bottom_->mutable_cpu_diff()); - const TypeParam* signs = this->blob_bottom_->cpu_diff(); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(signs[i], x[i] > 0 ? 1 : (x[i] < 0 ? -1 : 0)); - } -} - -TYPED_TEST(CPUMathFunctionsTest, TestSgnbit) { - int n = this->blob_bottom_->count(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - caffe_cpu_sgnbit(n, x, this->blob_bottom_->mutable_cpu_diff()); - const TypeParam* signbits = this->blob_bottom_->cpu_diff(); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(signbits[i], x[i] < 0 ? 1 : 0); - } -} - -TYPED_TEST(CPUMathFunctionsTest, TestFabs) { - int n = this->blob_bottom_->count(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - caffe_abs(n, x, this->blob_bottom_->mutable_cpu_diff()); - const TypeParam* abs_val = this->blob_bottom_->cpu_diff(); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(abs_val[i], x[i] > 0 ? x[i] : -x[i]); - } -} - -TYPED_TEST(CPUMathFunctionsTest, TestScale) { - int n = this->blob_bottom_->count(); - TypeParam alpha = this->blob_bottom_->cpu_diff()[caffe_rng_rand() % - this->blob_bottom_->count()]; - caffe_cpu_scale(n, alpha, this->blob_bottom_->cpu_data(), - this->blob_bottom_->mutable_cpu_diff()); - const TypeParam* scaled = this->blob_bottom_->cpu_diff(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(scaled[i], x[i] * alpha); - } -} - -TYPED_TEST(CPUMathFunctionsTest, TestCopy) { - const int n = this->blob_bottom_->count(); - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - TypeParam* top_data = this->blob_top_->mutable_cpu_data(); - caffe_copy(n, bottom_data, top_data); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(bottom_data[i], top_data[i]); - } -} - -#ifndef CPU_ONLY - -template -class GPUMathFunctionsTest : public MathFunctionsTest > { -}; - -TYPED_TEST_CASE(GPUMathFunctionsTest, TestDtypes); - -TYPED_TEST(GPUMathFunctionsTest, TestAsum) { - int n = this->blob_bottom_->count(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - TypeParam std_asum = 0; - for (int i = 0; i < n; ++i) { - std_asum += std::fabs(x[i]); - } - TypeParam gpu_asum; - caffe_gpu_asum(n, this->blob_bottom_->gpu_data(), &gpu_asum); - EXPECT_LT((gpu_asum - std_asum) / std_asum, 1e-2); -} - -TYPED_TEST(GPUMathFunctionsTest, TestSign) { - int n = this->blob_bottom_->count(); - caffe_gpu_sign(n, this->blob_bottom_->gpu_data(), - this->blob_bottom_->mutable_gpu_diff()); - const TypeParam* signs = this->blob_bottom_->cpu_diff(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(signs[i], x[i] > 0 ? 1 : (x[i] < 0 ? -1 : 0)); - } -} - -TYPED_TEST(GPUMathFunctionsTest, TestSgnbit) { - int n = this->blob_bottom_->count(); - caffe_gpu_sgnbit(n, this->blob_bottom_->gpu_data(), - this->blob_bottom_->mutable_gpu_diff()); - const TypeParam* signbits = this->blob_bottom_->cpu_diff(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(signbits[i], x[i] < 0 ? 
1 : 0); - } -} - -TYPED_TEST(GPUMathFunctionsTest, TestFabs) { - int n = this->blob_bottom_->count(); - caffe_gpu_abs(n, this->blob_bottom_->gpu_data(), - this->blob_bottom_->mutable_gpu_diff()); - const TypeParam* abs_val = this->blob_bottom_->cpu_diff(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(abs_val[i], x[i] > 0 ? x[i] : -x[i]); - } -} - -TYPED_TEST(GPUMathFunctionsTest, TestScale) { - int n = this->blob_bottom_->count(); - TypeParam alpha = this->blob_bottom_->cpu_diff()[caffe_rng_rand() % - this->blob_bottom_->count()]; - caffe_gpu_scale(n, alpha, this->blob_bottom_->gpu_data(), - this->blob_bottom_->mutable_gpu_diff()); - const TypeParam* scaled = this->blob_bottom_->cpu_diff(); - const TypeParam* x = this->blob_bottom_->cpu_data(); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(scaled[i], x[i] * alpha); - } -} - -TYPED_TEST(GPUMathFunctionsTest, TestCopy) { - const int n = this->blob_bottom_->count(); - const TypeParam* bottom_data = this->blob_bottom_->gpu_data(); - TypeParam* top_data = this->blob_top_->mutable_gpu_data(); - caffe_copy(n, bottom_data, top_data); - bottom_data = this->blob_bottom_->cpu_data(); - top_data = this->blob_top_->mutable_cpu_data(); - for (int i = 0; i < n; ++i) { - EXPECT_EQ(bottom_data[i], top_data[i]); - } -} - -#endif - - -} // namespace caffe diff --git a/src/caffe/test/test_maxpool_dropout_layers.cpp b/src/caffe/test/test_maxpool_dropout_layers.cpp deleted file mode 100644 index 4f0e20a..0000000 --- a/src/caffe/test/test_maxpool_dropout_layers.cpp +++ /dev/null @@ -1,127 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/dropout_layer.hpp" -#include "caffe/layers/pooling_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class MaxPoolingDropoutTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - protected: - MaxPoolingDropoutTest() - : blob_bottom_(new Blob()), - blob_top_(new Blob()) {} - virtual void SetUp() { - Caffe::set_random_seed(1703); - blob_bottom_->Reshape(2, 3, 6, 5); - // fill the values - FillerParameter filler_param; - filler_param.set_value(1.); - ConstantFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~MaxPoolingDropoutTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(MaxPoolingDropoutTest, TestDtypesAndDevices); - -TYPED_TEST(MaxPoolingDropoutTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - PoolingLayer max_layer(layer_param); - max_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - DropoutLayer dropout_layer(layer_param); - dropout_layer.SetUp(this->blob_top_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); - EXPECT_EQ(this->blob_top_->height(), 3); - EXPECT_EQ(this->blob_top_->width(), 2); -} - - -TYPED_TEST(MaxPoolingDropoutTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter 
layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* top_data = this->blob_top_->cpu_data(); - Dtype sum = 0.; - for (int i = 0; i < this->blob_top_->count(); ++i) { - sum += top_data[i]; - } - EXPECT_EQ(sum, this->blob_top_->count()); - // Dropout in-place - DropoutLayer dropout_layer(layer_param); - dropout_layer.SetUp(this->blob_top_vec_, this->blob_top_vec_); - dropout_layer.Forward(this->blob_top_vec_, this->blob_top_vec_); - sum = 0.; - Dtype scale = 1. / (1. - layer_param.dropout_param().dropout_ratio()); - top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_top_->count(); ++i) { - sum += top_data[i]; - } - EXPECT_GE(sum, 0); - EXPECT_LE(sum, this->blob_top_->count()*scale); -} - -TYPED_TEST(MaxPoolingDropoutTest, TestBackward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.set_phase(TRAIN); - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = 1.; - } - vector propagate_down(this->blob_bottom_vec_.size(), true); - layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - const Dtype* bottom_diff = this->blob_bottom_->cpu_diff(); - Dtype sum = 0.; - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - sum += bottom_diff[i]; - } - EXPECT_EQ(sum, this->blob_top_->count()); - // Dropout in-place - DropoutLayer dropout_layer(layer_param); - dropout_layer.SetUp(this->blob_top_vec_, this->blob_top_vec_); - dropout_layer.Forward(this->blob_top_vec_, this->blob_top_vec_); - dropout_layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_top_vec_); - layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); - Dtype sum_with_dropout = 0.; - bottom_diff = this->blob_bottom_->cpu_diff(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - sum_with_dropout += bottom_diff[i]; - } - EXPECT_GE(sum_with_dropout, sum); -} - -} // namespace caffe diff --git a/src/caffe/test/test_memory_data_layer.cpp b/src/caffe/test/test_memory_data_layer.cpp deleted file mode 100644 index 7998bc1..0000000 --- a/src/caffe/test/test_memory_data_layer.cpp +++ /dev/null @@ -1,299 +0,0 @@ -#ifdef USE_OPENCV -#include -#endif // USE_OPENCV - -#include -#include - -#include "caffe/filler.hpp" -#include "caffe/layers/memory_data_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class MemoryDataLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - MemoryDataLayerTest() - : data_(new Blob()), - labels_(new Blob()), - data_blob_(new Blob()), - label_blob_(new Blob()) {} - virtual void SetUp() { - batch_size_ = 8; - batches_ = 12; - channels_ = 4; - height_ = 7; - width_ = 11; - blob_top_vec_.push_back(data_blob_); - blob_top_vec_.push_back(label_blob_); - // pick random input data - FillerParameter filler_param; - GaussianFiller filler(filler_param); - data_->Reshape(batches_ * batch_size_, channels_, height_, 
width_); - labels_->Reshape(batches_ * batch_size_, 1, 1, 1); - filler.Fill(this->data_); - filler.Fill(this->labels_); - } - - virtual ~MemoryDataLayerTest() { - delete data_blob_; - delete label_blob_; - delete data_; - delete labels_; - } - int batch_size_; - int batches_; - int channels_; - int height_; - int width_; - // we don't really need blobs for the input data, but it makes it - // easier to call Filler - Blob* const data_; - Blob* const labels_; - // blobs for the top of MemoryDataLayer - Blob* const data_blob_; - Blob* const label_blob_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(MemoryDataLayerTest, TestDtypesAndDevices); - -TYPED_TEST(MemoryDataLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - - LayerParameter layer_param; - MemoryDataParameter* md_param = layer_param.mutable_memory_data_param(); - md_param->set_batch_size(this->batch_size_); - md_param->set_channels(this->channels_); - md_param->set_height(this->height_); - md_param->set_width(this->width_); - shared_ptr > layer( - new MemoryDataLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->data_blob_->num(), this->batch_size_); - EXPECT_EQ(this->data_blob_->channels(), this->channels_); - EXPECT_EQ(this->data_blob_->height(), this->height_); - EXPECT_EQ(this->data_blob_->width(), this->width_); - EXPECT_EQ(this->label_blob_->num(), this->batch_size_); - EXPECT_EQ(this->label_blob_->channels(), 1); - EXPECT_EQ(this->label_blob_->height(), 1); - EXPECT_EQ(this->label_blob_->width(), 1); -} - -// run through a few batches and check that the right data appears -TYPED_TEST(MemoryDataLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - - LayerParameter layer_param; - MemoryDataParameter* md_param = layer_param.mutable_memory_data_param(); - md_param->set_batch_size(this->batch_size_); - md_param->set_channels(this->channels_); - md_param->set_height(this->height_); - md_param->set_width(this->width_); - shared_ptr > layer( - new MemoryDataLayer(layer_param)); - layer->DataLayerSetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Reset(this->data_->mutable_cpu_data(), - this->labels_->mutable_cpu_data(), this->data_->num()); - for (int i = 0; i < this->batches_ * 6; ++i) { - int batch_num = i % this->batches_; - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int j = 0; j < this->data_blob_->count(); ++j) { - EXPECT_EQ(this->data_blob_->cpu_data()[j], - this->data_->cpu_data()[ - this->data_->offset(1) * this->batch_size_ * batch_num + j]); - } - for (int j = 0; j < this->label_blob_->count(); ++j) { - EXPECT_EQ(this->label_blob_->cpu_data()[j], - this->labels_->cpu_data()[this->batch_size_ * batch_num + j]); - } - } -} - -#ifdef USE_OPENCV -TYPED_TEST(MemoryDataLayerTest, AddDatumVectorDefaultTransform) { - typedef typename TypeParam::Dtype Dtype; - - LayerParameter param; - MemoryDataParameter* memory_data_param = param.mutable_memory_data_param(); - memory_data_param->set_batch_size(this->batch_size_); - memory_data_param->set_channels(this->channels_); - memory_data_param->set_height(this->height_); - memory_data_param->set_width(this->width_); - MemoryDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - // We add batch_size*num_iter items, then for each iteration - // we forward batch_size elements - int num_iter = 5; - vector datum_vector(this->batch_size_ * num_iter); - const size_t count = this->channels_ * this->height_ * this->width_; - 
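// [Editor's note] Illustrative sketch only, not part of the patch above; the
// helper name is made up. The per-element comparisons in these memory-data
// tests all rely on the row-major NCHW offset rule used by Blob::offset(),
// i.e. ((n*C + c)*H + h)*W + w, written out here as a small helper:
#include <cstddef>

inline std::size_t OffsetNCHW(std::size_t n, std::size_t c, std::size_t h,
                              std::size_t w, std::size_t C, std::size_t H,
                              std::size_t W) {
  return ((n * C + c) * H + h) * W + w;
}
// Example: with C = 4, H = 7, W = 11 (the shapes used in this fixture),
// element (n=1, c=0, h=0, w=0) lives at offset 1*4*7*11 = 308.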
size_t pixel_index = 0; - for (int i = 0; i < this->batch_size_ * num_iter; ++i) { - datum_vector[i].set_channels(this->channels_); - datum_vector[i].set_height(this->height_); - datum_vector[i].set_width(this->width_); - datum_vector[i].set_label(i); - vector pixels(count); - for (int j = 0; j < count; ++j) { - pixels[j] = pixel_index++ % 256; - } - datum_vector[i].set_data(&(pixels[0]), count); - } - layer.AddDatumVector(datum_vector); - - int data_index; - // Go through the data 5 times - for (int iter = 0; iter < num_iter; ++iter) { - int offset = this->batch_size_ * iter; - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->data_blob_->cpu_data(); - size_t index = 0; - for (int i = 0; i < this->batch_size_; ++i) { - const string& data_string = datum_vector[offset + i].data(); - EXPECT_EQ(offset + i, this->label_blob_->cpu_data()[i]); - for (int c = 0; c < this->channels_; ++c) { - for (int h = 0; h < this->height_; ++h) { - for (int w = 0; w < this->width_; ++w) { - data_index = (c * this->height_ + h) * this->width_ + w; - EXPECT_EQ(static_cast( - static_cast(data_string[data_index])), - data[index++]); - } - } - } - } - } -} - -TYPED_TEST(MemoryDataLayerTest, AddMatVectorDefaultTransform) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter param; - MemoryDataParameter* memory_data_param = param.mutable_memory_data_param(); - memory_data_param->set_batch_size(this->batch_size_); - memory_data_param->set_channels(this->channels_); - memory_data_param->set_height(this->height_); - memory_data_param->set_width(this->width_); - MemoryDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - // We add batch_size*num_iter items, then for each iteration - // we forward batch_size elements - int num_iter = 5; - vector mat_vector(this->batch_size_ * num_iter); - vector label_vector(this->batch_size_ * num_iter); - for (int i = 0; i < this->batch_size_*num_iter; ++i) { - mat_vector[i] = cv::Mat(this->height_, this->width_, CV_8UC4); - label_vector[i] = i; - cv::randu(mat_vector[i], cv::Scalar::all(0), cv::Scalar::all(255)); - } - layer.AddMatVector(mat_vector, label_vector); - - int data_index; - const size_t count = this->channels_ * this->height_ * this->width_; - for (int iter = 0; iter < num_iter; ++iter) { - int offset = this->batch_size_ * iter; - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->data_blob_->cpu_data(); - for (int i = 0; i < this->batch_size_; ++i) { - EXPECT_EQ(offset + i, this->label_blob_->cpu_data()[i]); - for (int h = 0; h < this->height_; ++h) { - const unsigned char* ptr_mat = mat_vector[offset + i].ptr(h); - int index = 0; - for (int w = 0; w < this->width_; ++w) { - for (int c = 0; c < this->channels_; ++c) { - data_index = (i*count) + (c * this->height_ + h) * this->width_ + w; - Dtype pixel = static_cast(ptr_mat[index++]); - EXPECT_EQ(static_cast(pixel), - data[data_index]); - } - } - } - } - } -} - -TYPED_TEST(MemoryDataLayerTest, TestSetBatchSize) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter param; - MemoryDataParameter* memory_data_param = param.mutable_memory_data_param(); - memory_data_param->set_batch_size(this->batch_size_); - memory_data_param->set_channels(this->channels_); - memory_data_param->set_height(this->height_); - memory_data_param->set_width(this->width_); - MemoryDataLayer layer(param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - // first add data as usual - int num_iter = 5; - vector 
mat_vector(this->batch_size_ * num_iter); - vector label_vector(this->batch_size_ * num_iter); - for (int i = 0; i < this->batch_size_*num_iter; ++i) { - mat_vector[i] = cv::Mat(this->height_, this->width_, CV_8UC4); - label_vector[i] = i; - cv::randu(mat_vector[i], cv::Scalar::all(0), cv::Scalar::all(255)); - } - layer.AddMatVector(mat_vector, label_vector); - // then consume the data - int data_index; - const size_t count = this->channels_ * this->height_ * this->width_; - for (int iter = 0; iter < num_iter; ++iter) { - int offset = this->batch_size_ * iter; - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->data_blob_->cpu_data(); - for (int i = 0; i < this->batch_size_; ++i) { - EXPECT_EQ(offset + i, this->label_blob_->cpu_data()[i]); - for (int h = 0; h < this->height_; ++h) { - const unsigned char* ptr_mat = mat_vector[offset + i].ptr(h); - int index = 0; - for (int w = 0; w < this->width_; ++w) { - for (int c = 0; c < this->channels_; ++c) { - data_index = (i*count) + (c * this->height_ + h) * this->width_ + w; - Dtype pixel = static_cast(ptr_mat[index++]); - EXPECT_EQ(static_cast(pixel), data[data_index]); - } - } - } - } - } - // and then add new data with different batch_size - int new_batch_size = 16; - layer.set_batch_size(new_batch_size); - mat_vector.clear(); - mat_vector.resize(new_batch_size * num_iter); - label_vector.clear(); - label_vector.resize(new_batch_size * num_iter); - for (int i = 0; i < new_batch_size*num_iter; ++i) { - mat_vector[i] = cv::Mat(this->height_, this->width_, CV_8UC4); - label_vector[i] = i; - cv::randu(mat_vector[i], cv::Scalar::all(0), cv::Scalar::all(255)); - } - layer.AddMatVector(mat_vector, label_vector); - - // finally consume new data and check if everything is fine - for (int iter = 0; iter < num_iter; ++iter) { - int offset = new_batch_size * iter; - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(new_batch_size, this->blob_top_vec_[0]->num()); - EXPECT_EQ(new_batch_size, this->blob_top_vec_[1]->num()); - const Dtype* data = this->data_blob_->cpu_data(); - for (int i = 0; i < new_batch_size; ++i) { - EXPECT_EQ(offset + i, this->label_blob_->cpu_data()[i]); - for (int h = 0; h < this->height_; ++h) { - const unsigned char* ptr_mat = mat_vector[offset + i].ptr(h); - int index = 0; - for (int w = 0; w < this->width_; ++w) { - for (int c = 0; c < this->channels_; ++c) { - data_index = (i*count) + (c * this->height_ + h) * this->width_ + w; - Dtype pixel = static_cast(ptr_mat[index++]); - EXPECT_EQ(static_cast(pixel), data[data_index]); - } - } - } - } - } -} -#endif // USE_OPENCV -} // namespace caffe diff --git a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp deleted file mode 100644 index 8cc2102..0000000 --- a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/multinomial_logistic_loss_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class MultinomialLogisticLossLayerTest : public CPUDeviceTest { - protected: - MultinomialLogisticLossLayerTest() - : blob_bottom_data_(new Blob(10, 5, 1, 1)), - blob_bottom_label_(new Blob(10, 1, 1, 1)), - blob_top_loss_(new Blob()) { - Caffe::set_random_seed(1701); - // fill the values - FillerParameter 
filler_param;
-    PositiveUnitballFiller<TypeParam> filler(filler_param);
-    filler.Fill(this->blob_bottom_data_);
-    blob_bottom_vec_.push_back(blob_bottom_data_);
-    for (int i = 0; i < blob_bottom_label_->count(); ++i) {
-      blob_bottom_label_->mutable_cpu_data()[i] = caffe_rng_rand() % 5;
-    }
-    blob_bottom_vec_.push_back(blob_bottom_label_);
-    blob_top_vec_.push_back(blob_top_loss_);
-  }
-  virtual ~MultinomialLogisticLossLayerTest() {
-    delete blob_bottom_data_;
-    delete blob_bottom_label_;
-    delete blob_top_loss_;
-  }
-  Blob<TypeParam>* const blob_bottom_data_;
-  Blob<TypeParam>* const blob_bottom_label_;
-  Blob<TypeParam>* const blob_top_loss_;
-  vector<Blob<TypeParam>*> blob_bottom_vec_;
-  vector<Blob<TypeParam>*> blob_top_vec_;
-};
-
-TYPED_TEST_CASE(MultinomialLogisticLossLayerTest, TestDtypes);
-
-
-TYPED_TEST(MultinomialLogisticLossLayerTest, TestGradientCPU) {
-  LayerParameter layer_param;
-  MultinomialLogisticLossLayer<TypeParam> layer(layer_param);
-  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  GradientChecker<TypeParam> checker(1e-2, 2*1e-2, 1701, 0, 0.05);
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-      this->blob_top_vec_, 0);
-}
-
-} // namespace caffe
diff --git a/src/caffe/test/test_mvn_layer.cpp b/src/caffe/test/test_mvn_layer.cpp
deleted file mode 100644
index 28a762d..0000000
--- a/src/caffe/test/test_mvn_layer.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <vector>
-
-#include "caffe/blob.hpp"
-#include "caffe/common.hpp"
-#include "caffe/filler.hpp"
-#include "caffe/layers/mvn_layer.hpp"
-#include "google/protobuf/text_format.h"
-#include "gtest/gtest.h"
-
-#include "caffe/test/test_caffe_main.hpp"
-#include "caffe/test/test_gradient_check_util.hpp"
-
-namespace caffe {
-
-template <typename TypeParam>
-class MVNLayerTest : public MultiDeviceTest<TypeParam> {
-  typedef typename TypeParam::Dtype Dtype;
- protected:
-  MVNLayerTest()
-      : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
-        blob_top_(new Blob<Dtype>()) {
-    // fill the values
-    FillerParameter filler_param;
-    GaussianFiller<Dtype> filler(filler_param);
-    filler.Fill(this->blob_bottom_);
-    blob_bottom_vec_.push_back(blob_bottom_);
-    blob_top_vec_.push_back(blob_top_);
-  }
-  virtual ~MVNLayerTest() { delete blob_bottom_; delete blob_top_; }
-  Blob<Dtype>* const blob_bottom_;
-  Blob<Dtype>* const blob_top_;
-  vector<Blob<Dtype>*> blob_bottom_vec_;
-  vector<Blob<Dtype>*> blob_top_vec_;
-};
-
-TYPED_TEST_CASE(MVNLayerTest, TestDtypesAndDevices);
-
-TYPED_TEST(MVNLayerTest, TestForward) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  MVNLayer<Dtype> layer(layer_param);
-  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Test mean
-  int num = this->blob_bottom_->num();
-  int channels = this->blob_bottom_->channels();
-  int height = this->blob_bottom_->height();
-  int width = this->blob_bottom_->width();
-
-  for (int i = 0; i < num; ++i) {
-    for (int j = 0; j < channels; ++j) {
-      Dtype sum = 0, var = 0;
-      for (int k = 0; k < height; ++k) {
-        for (int l = 0; l < width; ++l) {
-          Dtype data = this->blob_top_->data_at(i, j, k, l);
-          sum += data;
-          var += data * data;
-        }
-      }
-      sum /= height * width;
-      var /= height * width;
-
-      const Dtype kErrorBound = 0.001;
-      // expect zero mean
-      EXPECT_NEAR(0, sum, kErrorBound);
-      // expect unit variance
-      EXPECT_NEAR(1, var, kErrorBound);
-    }
-  }
-}
-
-TYPED_TEST(MVNLayerTest, TestForwardMeanOnly) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  CHECK(google::protobuf::TextFormat::ParseFromString(
-      "mvn_param{normalize_variance: false}", &layer_param));
-  MVNLayer<Dtype> layer(layer_param);
-  layer.SetUp(this->blob_bottom_vec_,
this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Test mean - int num = this->blob_bottom_->num(); - int channels = this->blob_bottom_->channels(); - int height = this->blob_bottom_->height(); - int width = this->blob_bottom_->width(); - - for (int i = 0; i < num; ++i) { - for (int j = 0; j < channels; ++j) { - Dtype sum = 0, var = 0; - for (int k = 0; k < height; ++k) { - for (int l = 0; l < width; ++l) { - Dtype data = this->blob_top_->data_at(i, j, k, l); - sum += data; - var += data * data; - } - } - sum /= height * width; - - const Dtype kErrorBound = 0.001; - // expect zero mean - EXPECT_NEAR(0, sum, kErrorBound); - } - } -} - -TYPED_TEST(MVNLayerTest, TestForwardAcrossChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "mvn_param{across_channels: true}", &layer_param)); - MVNLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Test mean - int num = this->blob_bottom_->num(); - int channels = this->blob_bottom_->channels(); - int height = this->blob_bottom_->height(); - int width = this->blob_bottom_->width(); - - for (int i = 0; i < num; ++i) { - Dtype sum = 0, var = 0; - for (int j = 0; j < channels; ++j) { - for (int k = 0; k < height; ++k) { - for (int l = 0; l < width; ++l) { - Dtype data = this->blob_top_->data_at(i, j, k, l); - sum += data; - var += data * data; - } - } - } - sum /= height * width * channels; - var /= height * width * channels; - - const Dtype kErrorBound = 0.001; - // expect zero mean - EXPECT_NEAR(0, sum, kErrorBound); - // expect unit variance - EXPECT_NEAR(1, var, kErrorBound); - } -} - -TYPED_TEST(MVNLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - MVNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(MVNLayerTest, TestGradientMeanOnly) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "mvn_param{normalize_variance: false}", &layer_param)); - MVNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(MVNLayerTest, TestGradientAcrossChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "mvn_param{across_channels: true}", &layer_param)); - MVNLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp deleted file mode 100644 index 92fd317..0000000 --- a/src/caffe/test/test_net.cpp +++ /dev/null @@ -1,2476 +0,0 @@ -#include -#include -#include - -#include "google/protobuf/text_format.h" - -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/net.hpp" -#include "caffe/util/math_functions.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class NetTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - NetTest() : seed_(1701) 
{} - - virtual void InitNetFromProtoString(const string& proto) { - NetParameter param; - CHECK(google::protobuf::TextFormat::ParseFromString(proto, ¶m)); - net_.reset(new Net(param)); - } - - virtual void CopyNetBlobs(const bool copy_diff, - vector > >* blobs_copy) { - CHECK(net_); - const vector > >& net_blobs = net_->blobs(); - blobs_copy->clear(); - blobs_copy->resize(net_blobs.size()); - const bool kReshape = true; - for (int i = 0; i < net_blobs.size(); ++i) { - (*blobs_copy)[i].reset(new Blob()); - (*blobs_copy)[i]->CopyFrom(*net_blobs[i], copy_diff, kReshape); - } - } - - virtual void CopyNetParams(const bool copy_diff, - vector > >* params_copy) { - CHECK(net_); - const vector > >& net_params = net_->params(); - params_copy->clear(); - params_copy->resize(net_params.size()); - const bool kReshape = true; - for (int i = 0; i < net_params.size(); ++i) { - (*params_copy)[i].reset(new Blob()); - (*params_copy)[i]->CopyFrom(*net_params[i], copy_diff, kReshape); - } - } - - virtual void InitTinyNet(const bool force_backward = false, - const bool accuracy_layer = false) { - string proto = - "name: 'TinyTestNetwork' " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " shape { " - " dim: 5 " - " dim: 2 " - " dim: 3 " - " dim: 4 " - " } " - " data_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " shape { " - " dim: 5 " - " } " - " data_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerproduct' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'data' " - " top: 'innerproduct' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerproduct' " - " bottom: 'label' " - " top: 'top_loss' " - "} "; - if (accuracy_layer) { - proto += - "layer { " - " name: 'loss' " - " type: 'Accuracy' " - " bottom: 'innerproduct' " - " bottom: 'label' " - " top: 'accuracy' " - "} "; - } - if (force_backward) { - proto += "force_backward: true "; - } - InitNetFromProtoString(proto); - } - - virtual void InitTinyNetEuclidean(const bool force_backward = false) { - string proto = - "name: 'TinyTestEuclidLossNetwork' " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " num: 5 " - " channels: 2 " - " height: 3 " - " width: 4 " - " num: 5 " - " channels: 1 " - " height: 1 " - " width: 1 " - " data_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerproduct' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'data' " - " top: 'innerproduct' " - "} " - "layer { " - " name: 'loss' " - " type: 'EuclideanLoss' " - " bottom: 'innerproduct' " - " bottom: 'label' " - "} "; - if (force_backward) { - proto += "force_backward: true "; - } - InitNetFromProtoString(proto); - } - - virtual void InitTrickyNet(Dtype* loss_weight = NULL) { - ostringstream 
loss_weight_stream; - if (loss_weight) { - loss_weight_stream << " loss_weight: " << *loss_weight << " "; - } - const string& proto = - "name: 'TrickyTestNetwork' " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " num: 5 " - " channels: 2 " - " height: 3 " - " width: 4 " - " num: 5 " - " channels: 1 " - " height: 1 " - " width: 1 " - " data_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerproduct' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'data' " - " top: 'transformed_data' " - "} " - "layer { " - " name: 'innerproduct' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'label' " - " top: 'transformed_label' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " + - loss_weight_stream.str() + - " bottom: 'transformed_data' " - " bottom: 'transformed_label' " - "} "; - InitNetFromProtoString(proto); - } - - // loss_weight is the loss weight for the 'EuclideanLoss' layer output. - // midnet_loss_weight is the loss weight for the first 'InnerProduct' layer - // output. Should both default to 0.0 if unspecified (i.e., if NULL is - // passed to this function). - virtual void InitUnsharedWeightsNet(const Dtype* loss_weight = NULL, - const Dtype* midnet_loss_weight = NULL, - const bool force_backward = false, const bool bias_term = false, - const Dtype blobs_lr_w1 = 1, const Dtype blobs_lr_b1 = 2, - const Dtype blobs_lr_w2 = 1, const Dtype blobs_lr_b2 = 2) { - string bias_str = bias_term ? 
"true ":"false "; - ostringstream proto; - proto << "name: 'UnsharedWeightsNetwork' "; - if (force_backward) { - proto << "force_backward: true "; - } - proto << - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " num: 5 " - " channels: 2 " - " height: 3 " - " width: 4 " - " data_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " } " - " top: 'data' " - "} " - "layer { " - " name: 'innerproduct1' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: " << bias_str << - " weight_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " param { " - " name: 'unsharedweights1' " - " lr_mult: " << blobs_lr_w1 << - " } "; - if (bias_term) { - proto << " param { lr_mult: " << blobs_lr_b1 << " } "; - } - proto << - " bottom: 'data' " - " top: 'innerproduct1' "; - if (midnet_loss_weight) { - proto << " loss_weight: " << *midnet_loss_weight << " "; - } - proto << - "} " - "layer { " - " name: 'innerproduct2' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: " << bias_str << - " weight_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " param { " - " name: 'unsharedweights2' " - " lr_mult: " << blobs_lr_w2 << - " } "; - if (bias_term) { - proto << " param { lr_mult: " << blobs_lr_b2 << " } "; - } - proto << - " bottom: 'data' " - " top: 'innerproduct2' " - "} " - "layer { " - " name: 'loss' " - " type: 'EuclideanLoss' "; - if (loss_weight) { - proto << " loss_weight: " << *loss_weight << " "; - } - proto << - " bottom: 'innerproduct1' " - " bottom: 'innerproduct2' " - "} "; - InitNetFromProtoString(proto.str()); - } - - virtual void InitSharedWeightsNet() { - const string& proto = - "name: 'SharedWeightsNetwork' " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " num: 5 " - " channels: 2 " - " height: 3 " - " width: 4 " - " data_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " } " - " top: 'data' " - "} " - "layer { " - " name: 'innerproduct1' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " param { name: 'sharedweights' } " - " bottom: 'data' " - " top: 'innerproduct1' " - "} " - "layer { " - " name: 'innerproduct2' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " param { name: 'sharedweights' } " - " bottom: 'data' " - " top: 'innerproduct2' " - "} " - "layer { " - " name: 'loss' " - " type: 'EuclideanLoss' " - " bottom: 'innerproduct1' " - " bottom: 'innerproduct2' " - "} "; - InitNetFromProtoString(proto); - } - - virtual void InitDiffDataUnsharedWeightsNet() { - const string& proto = - "name: 'DiffDataUnsharedWeightsNetwork' " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " num: 10 " - " channels: 10 " - " height: 1 " - " width: 1 " - " num: 10 " - " channels: 10 " - " height: 1 " - " width: 1 " - " data_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " top: 'data1' " - " top: 'data2' " - "} " - "layer { " - " name: 'innerproduct1' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'constant' " - " value: 0.5 " - " } " - " } " - " param { name: 'unsharedweights1' } " - " bottom: 'data1' " - " top: 
'innerproduct1' " - "} " - "layer { " - " name: 'innerproduct2' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'constant' " - " value: 0.5 " - " } " - " } " - " param { name: 'unsharedweights2' } " - " bottom: 'innerproduct1' " - " top: 'innerproduct2' " - "} " - "layer { " - " name: 'loss' " - " type: 'EuclideanLoss' " - " bottom: 'data2' " - " bottom: 'innerproduct2' " - "} "; - InitNetFromProtoString(proto); - } - - virtual void InitDiffDataSharedWeightsNet() { - const string& proto = - "name: 'DiffDataSharedWeightsNetwork' " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " num: 10 " - " channels: 10 " - " height: 1 " - " width: 1 " - " num: 10 " - " channels: 10 " - " height: 1 " - " width: 1 " - " data_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " top: 'data1' " - " top: 'data2' " - "} " - "layer { " - " name: 'innerproduct1' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'constant' " - " value: 0.5 " - " } " - " } " - " param { name: 'sharedweights' } " - " bottom: 'data1' " - " top: 'innerproduct1' " - "} " - "layer { " - " name: 'innerproduct2' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'constant' " - " value: 0.5 " - " } " - " } " - " param { name: 'sharedweights' } " - " bottom: 'innerproduct1' " - " top: 'innerproduct2' " - "} " - "layer { " - " name: 'loss' " - " type: 'EuclideanLoss' " - " bottom: 'data2' " - " bottom: 'innerproduct2' " - "} "; - InitNetFromProtoString(proto); - } - - virtual void InitReshapableNet() { - const string& proto = - "name: 'ReshapableNetwork' " - "layer { " - " name: 'data' " - " type: 'Input' " - " top: 'data' " - " input_param { " - " shape: { dim: 1 dim: 3 dim: 100 dim: 100 } " - " } " - "} " - "layer { " - " name: 'conv1' " - " type: 'Convolution' " - " bottom: 'data' " - " top: 'conv1' " - " convolution_param { " - " num_output: 5 " - " kernel_size: 3 " - " stride: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0.2 " - " } " - " } " - "} " - "layer { " - " name: 'relu1' " - " type: 'ReLU' " - " bottom: 'conv1' " - " top: 'conv1' " - "} " - "layer { " - " name: 'pool1' " - " type: 'Pooling' " - " bottom: 'conv1' " - " top: 'pool1' " - " pooling_param { " - " pool: MAX " - " kernel_size: 2 " - " stride: 2 " - " } " - "} " - "layer { " - " name: 'norm1' " - " type: 'LRN' " - " bottom: 'pool1' " - " top: 'norm1' " - " lrn_param { " - " local_size: 3 " - " } " - "} " - "layer { " - " name: 'softmax' " - " type: 'Softmax' " - " bottom: 'norm1' " - " top: 'softmax' " - "} "; - InitNetFromProtoString(proto); - } - - virtual void InitSkipPropNet(bool test_skip_true) { - string proto = - "name: 'SkipPropTestNetwork' " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " shape { " - " dim: 5 " - " dim: 2 " - " dim: 3 " - " dim: 4 " - " } " - " data_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " shape { " - " dim: 5 " - " } " - " data_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'silence' " - " bottom: 'label' " - " type: 'Silence' " - "} " - "layer { " - " name: 'innerproduct' " - " type: 'InnerProduct' " - " inner_product_param { 
" - " num_output: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'data' " - " top: 'innerproduct' " - "} " - "layer { " - " name: 'ip_fake_labels' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " bottom: 'data' " - " top: 'fake_labels' " - "} " - "layer { " - " name: 'argmax' " - " bottom: 'fake_labels' " - " top: 'label_argmax' " - " type: 'ArgMax' " - "} " - "layer { " - " name: 'loss' " - " bottom: 'innerproduct' " - " bottom: 'label_argmax' "; - if (test_skip_true) - proto += " propagate_down: true " - " propagate_down: false "; - else - proto += " propagate_down: true " - " propagate_down: true "; - proto += - " top: 'cross_entropy_loss' " - " type: 'SigmoidCrossEntropyLoss' " - " loss_weight: 0.1 " - "} "; - InitNetFromProtoString(proto); - } - - virtual void InitForcePropNet(bool test_force_true) { - string proto = - "name: 'ForcePropTestNetwork' " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " shape { " - " dim: 5 " - " dim: 2 " - " dim: 3 " - " dim: 4 " - " } " - " data_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " shape { " - " dim: 5 " - " } " - " data_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerproduct' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " } " - " bottom: 'data' " - " top: 'innerproduct' "; - if (test_force_true) { - proto += " propagate_down: true "; - } - proto += - "} " - "layer { " - " name: 'loss' " - " bottom: 'innerproduct' " - " bottom: 'label' " - " top: 'cross_entropy_loss' " - " type: 'SigmoidCrossEntropyLoss' " - "} "; - InitNetFromProtoString(proto); - } - - int seed_; - shared_ptr > net_; -}; - -TYPED_TEST_CASE(NetTest, TestDtypesAndDevices); - -TYPED_TEST(NetTest, TestHasBlob) { - this->InitTinyNet(); - EXPECT_TRUE(this->net_->has_blob("data")); - EXPECT_TRUE(this->net_->has_blob("label")); - EXPECT_TRUE(this->net_->has_blob("innerproduct")); - EXPECT_FALSE(this->net_->has_blob("loss")); - EXPECT_TRUE(this->net_->has_blob("top_loss")); -} - -TYPED_TEST(NetTest, TestGetBlob) { - this->InitTinyNet(); - EXPECT_EQ(this->net_->blob_by_name("data"), this->net_->blobs()[0]); - EXPECT_EQ(this->net_->blob_by_name("label"), this->net_->blobs()[1]); - EXPECT_EQ(this->net_->blob_by_name("innerproduct"), this->net_->blobs()[2]); - EXPECT_FALSE(this->net_->blob_by_name("loss")); - EXPECT_EQ(this->net_->blob_by_name("top_loss"), this->net_->blobs()[3]); -} - -TYPED_TEST(NetTest, TestHasLayer) { - this->InitTinyNet(); - EXPECT_TRUE(this->net_->has_layer("data")); - EXPECT_TRUE(this->net_->has_layer("innerproduct")); - EXPECT_TRUE(this->net_->has_layer("loss")); - EXPECT_FALSE(this->net_->has_layer("label")); -} - -TYPED_TEST(NetTest, TestGetLayerByName) { - this->InitTinyNet(); - EXPECT_EQ(this->net_->layer_by_name("data"), this->net_->layers()[0]); - EXPECT_EQ(this->net_->layer_by_name("innerproduct"), this->net_->layers()[1]); - EXPECT_EQ(this->net_->layer_by_name("loss"), this->net_->layers()[2]); - 
EXPECT_FALSE(this->net_->layer_by_name("label")); -} - -TYPED_TEST(NetTest, TestBottomNeedBackward) { - this->InitTinyNet(); - const vector >& bottom_need_backward = - this->net_->bottom_need_backward(); - EXPECT_EQ(3, bottom_need_backward.size()); - EXPECT_EQ(0, bottom_need_backward[0].size()); - EXPECT_EQ(1, bottom_need_backward[1].size()); - EXPECT_EQ(false, bottom_need_backward[1][0]); - EXPECT_EQ(2, bottom_need_backward[2].size()); - EXPECT_EQ(true, bottom_need_backward[2][0]); - EXPECT_EQ(false, bottom_need_backward[2][1]); -} - -TYPED_TEST(NetTest, TestBottomNeedBackwardForce) { - const bool force_backward = true; - this->InitTinyNet(force_backward); - const vector >& bottom_need_backward = - this->net_->bottom_need_backward(); - EXPECT_EQ(3, bottom_need_backward.size()); - EXPECT_EQ(0, bottom_need_backward[0].size()); - EXPECT_EQ(1, bottom_need_backward[1].size()); - EXPECT_EQ(true, bottom_need_backward[1][0]); - EXPECT_EQ(2, bottom_need_backward[2].size()); - EXPECT_EQ(true, bottom_need_backward[2][0]); - EXPECT_EQ(false, bottom_need_backward[2][1]); -} - -TYPED_TEST(NetTest, TestBottomNeedBackwardEuclideanForce) { - const bool force_backward = true; - this->InitTinyNetEuclidean(force_backward); - const vector >& bottom_need_backward = - this->net_->bottom_need_backward(); - EXPECT_EQ(3, bottom_need_backward.size()); - EXPECT_EQ(0, bottom_need_backward[0].size()); - EXPECT_EQ(1, bottom_need_backward[1].size()); - EXPECT_EQ(true, bottom_need_backward[1][0]); - EXPECT_EQ(2, bottom_need_backward[2].size()); - EXPECT_EQ(true, bottom_need_backward[2][0]); - EXPECT_EQ(true, bottom_need_backward[2][1]); -} - -TYPED_TEST(NetTest, TestBottomNeedBackwardTricky) { - this->InitTrickyNet(); - const vector >& bottom_need_backward = - this->net_->bottom_need_backward(); - EXPECT_EQ(4, bottom_need_backward.size()); - EXPECT_EQ(0, bottom_need_backward[0].size()); - EXPECT_EQ(1, bottom_need_backward[1].size()); - EXPECT_EQ(false, bottom_need_backward[1][0]); - EXPECT_EQ(1, bottom_need_backward[2].size()); - EXPECT_EQ(false, bottom_need_backward[2][0]); - EXPECT_EQ(2, bottom_need_backward[3].size()); - EXPECT_EQ(true, bottom_need_backward[3][0]); - // The label input to the SoftmaxLossLayer should say it "needs backward" - // since it has weights under it, even though we expect this to cause a crash - // at training/test time. - EXPECT_EQ(true, bottom_need_backward[3][1]); -} - -TYPED_TEST(NetTest, TestLossWeight) { - typedef typename TypeParam::Dtype Dtype; - // First, compute the loss and gradients with no loss_weight specified. - // In this case, the loss weight for the 'EuclideanLoss' layer should default - // to 1. - vector*> bottom; - Caffe::set_random_seed(this->seed_); - const bool kForceBackward = true; - this->InitUnsharedWeightsNet(NULL, NULL, kForceBackward); - const Dtype loss = this->net_->ForwardBackward(); - const bool kCopyDiff = true; - vector > > blob_grads; - this->CopyNetBlobs(kCopyDiff, &blob_grads); - vector > > param_grads; - this->CopyNetParams(kCopyDiff, ¶m_grads); - // Check that the loss is non-trivial, otherwise the test doesn't prove much. 
- const Dtype kMinLossAbsValue = 1e-2; - ASSERT_GE(fabs(loss), kMinLossAbsValue); - const Dtype kErrorMargin = 1e-4; - const int kNumLossWeights = 6; - Dtype kLossWeights[kNumLossWeights] = {2, 0, 1, -1, -2.5, 3.7}; - for (int i = 0; i < kNumLossWeights; ++i) { - Caffe::set_random_seed(this->seed_); - this->InitUnsharedWeightsNet(&kLossWeights[i], NULL, kForceBackward); - const Dtype weighted_loss = this->net_->ForwardBackward(); - const Dtype error_margin = kErrorMargin * fabs(kLossWeights[i]); - EXPECT_NEAR(loss * kLossWeights[i], weighted_loss, error_margin) - << "loss weight = " << kLossWeights[i]; - const vector > >& weighted_blobs = - this->net_->blobs(); - ASSERT_EQ(blob_grads.size(), weighted_blobs.size()); - for (int j = 0; j < blob_grads.size(); ++j) { - ASSERT_EQ(blob_grads[j]->count(), weighted_blobs[j]->count()); - for (int k = 0; k < blob_grads[j]->count(); ++k) { - EXPECT_NEAR(blob_grads[j]->cpu_diff()[k] * kLossWeights[i], - weighted_blobs[j]->cpu_diff()[k], error_margin); - } - } - const vector > >& weighted_params = - this->net_->params(); - ASSERT_EQ(param_grads.size(), weighted_params.size()); - for (int j = 0; j < param_grads.size(); ++j) { - ASSERT_EQ(param_grads[j]->count(), weighted_params[j]->count()); - for (int k = 0; k < param_grads[j]->count(); ++k) { - EXPECT_NEAR(param_grads[j]->cpu_diff()[k] * kLossWeights[i], - weighted_params[j]->cpu_diff()[k], error_margin); - } - } - } -} - -TYPED_TEST(NetTest, TestLossWeightMidNet) { - typedef typename TypeParam::Dtype Dtype; - Caffe::set_random_seed(this->seed_); - const bool kForceBackward = true; - Dtype loss_weight = 0; - Dtype midnet_loss_weight = 1; - this->InitUnsharedWeightsNet(&loss_weight, &midnet_loss_weight, - kForceBackward); - const Dtype loss = this->net_->ForwardBackward(); - const bool kCopyDiff = true; - const bool kReshape = true; - Blob data_grad; - data_grad.CopyFrom(*this->net_->blob_by_name("data"), kCopyDiff, kReshape); - // Check that the loss is non-trivial, otherwise the test doesn't prove much. - const Dtype kMinLossAbsValue = 1e-2; - ASSERT_GE(fabs(loss), kMinLossAbsValue); - const Dtype kErrorMargin = 1e-4; - const int kNumLossWeights = 6; - Dtype kLossWeights[kNumLossWeights] = {2, 0, 1, -1, -2.5, 3.7}; - for (int i = 0; i < kNumLossWeights; ++i) { - Caffe::set_random_seed(this->seed_); - this->InitUnsharedWeightsNet(&loss_weight, &kLossWeights[i], - kForceBackward); - const Dtype weighted_loss = this->net_->ForwardBackward(); - const Dtype error_margin = kErrorMargin * fabs(kLossWeights[i]); - EXPECT_NEAR(loss * kLossWeights[i], weighted_loss, error_margin) - << "loss weight = " << kLossWeights[i]; - const shared_ptr >& weighted_blob = - this->net_->blob_by_name("data"); - ASSERT_EQ(data_grad.count(), weighted_blob->count()); - for (int j = 0; j < data_grad.count(); ++j) { - EXPECT_NEAR(data_grad.cpu_diff()[j] * kLossWeights[i], - weighted_blob->cpu_diff()[j], error_margin); - } - } -} - -TYPED_TEST(NetTest, TestComboLossWeight) { - typedef typename TypeParam::Dtype Dtype; - Dtype loss_weight; - Dtype midnet_loss_weight; - const bool kForceBackward = true; - const Dtype kErrorMargin = 1e-4; - - // Get the loss and gradients with 'EuclideanLoss' weight 1, - // 'InnerProduct' weight 1. 
- loss_weight = 1; - midnet_loss_weight = 1; - Caffe::set_random_seed(this->seed_); - this->InitUnsharedWeightsNet(&loss_weight, &midnet_loss_weight, - kForceBackward); - const Dtype loss = this->net_->ForwardBackward(); - const bool kCopyDiff = true; - vector > > blob_grads; - this->CopyNetBlobs(kCopyDiff, &blob_grads); - vector > > param_grads; - this->CopyNetParams(kCopyDiff, ¶m_grads); - - loss_weight = 2; - midnet_loss_weight = 1; - Caffe::set_random_seed(this->seed_); - this->InitUnsharedWeightsNet(&loss_weight, &midnet_loss_weight, - kForceBackward); - const Dtype loss_main_2 = this->net_->ForwardBackward(); - vector > > blob_grads_loss_2; - this->CopyNetBlobs(kCopyDiff, &blob_grads_loss_2); - vector > > param_grads_loss_2; - this->CopyNetParams(kCopyDiff, ¶m_grads_loss_2); - - loss_weight = 3; - midnet_loss_weight = 1; - Caffe::set_random_seed(this->seed_); - this->InitUnsharedWeightsNet(&loss_weight, &midnet_loss_weight, - kForceBackward); - const Dtype loss_main_3 = this->net_->ForwardBackward(); - const vector > >& blob_grads_loss_3 = - this->net_->blobs(); - ASSERT_EQ(blob_grads.size(), blob_grads_loss_3.size()); - ASSERT_EQ(blob_grads_loss_2.size(), blob_grads_loss_3.size()); - for (int j = 0; j < blob_grads.size(); ++j) { - const string& blob_name = this->net_->blob_names()[j]; - bool grad_should_change = true; - if (blob_name == "innerproduct1_innerproduct1_0_split_0") { - grad_should_change = false; - } - ASSERT_EQ(blob_grads[j]->count(), blob_grads_loss_3[j]->count()); - ASSERT_EQ(blob_grads_loss_2[j]->count(), blob_grads_loss_3[j]->count()); - for (int k = 0; k < blob_grads[j]->count(); ++k) { - const Dtype grad_diff_2 = blob_grads_loss_2[j]->cpu_diff()[k] - - blob_grads[j]->cpu_diff()[k]; - const Dtype grad_diff_3 = blob_grads_loss_3[j]->cpu_diff()[k] - - blob_grads[j]->cpu_diff()[k]; - if (grad_should_change) { - // Test non-triviality. 
- const Dtype kMinGradDiffAbsValue = 1e-4; - EXPECT_GT(fabs(grad_diff_2), kMinGradDiffAbsValue) << blob_name; - EXPECT_NEAR(2 * grad_diff_2, grad_diff_3, kErrorMargin) << blob_name; - } else { - EXPECT_EQ(0, grad_diff_2) << blob_name; - EXPECT_EQ(0, grad_diff_3) << blob_name; - } - } - } - - loss_weight = 1; - midnet_loss_weight = 2; - Caffe::set_random_seed(this->seed_); - this->InitUnsharedWeightsNet(&loss_weight, &midnet_loss_weight, - kForceBackward); - const Dtype loss_midnet_2 = this->net_->ForwardBackward(); - this->CopyNetBlobs(kCopyDiff, &blob_grads_loss_2); - this->CopyNetParams(kCopyDiff, ¶m_grads_loss_2); - - loss_weight = 1; - midnet_loss_weight = 3; - Caffe::set_random_seed(this->seed_); - this->InitUnsharedWeightsNet(&loss_weight, &midnet_loss_weight, - kForceBackward); - const Dtype loss_midnet_3 = this->net_->ForwardBackward(); - const vector > >& blob_grads_midnet_loss_3 = - this->net_->blobs(); - ASSERT_EQ(blob_grads.size(), blob_grads_midnet_loss_3.size()); - ASSERT_EQ(blob_grads_loss_2.size(), blob_grads_midnet_loss_3.size()); - const vector& blob_names = this->net_->blob_names(); - for (int j = 0; j < blob_grads.size(); ++j) { - const string& blob_name = blob_names[j]; - bool grad_should_change = false; - if (blob_name == "innerproduct1" || - blob_name == "innerproduct1_innerproduct1_0_split_0" || - blob_name == "data_data_0_split_0" || blob_name == "data") { - grad_should_change = true; - } - ASSERT_EQ(blob_grads[j]->count(), blob_grads_midnet_loss_3[j]->count()); - ASSERT_EQ(blob_grads[j]->count(), blob_grads_loss_2[j]->count()); - for (int k = 0; k < blob_grads[j]->count(); ++k) { - const Dtype grad_diff_2 = blob_grads_loss_2[j]->cpu_diff()[k] - - blob_grads[j]->cpu_diff()[k]; - const Dtype grad_diff_3 = blob_grads_midnet_loss_3[j]->cpu_diff()[k] - - blob_grads[j]->cpu_diff()[k]; - if (grad_should_change) { - // Test non-triviality. - const Dtype kMinGradDiffAbsValue = 1e-4; - EXPECT_GT(fabs(grad_diff_2), kMinGradDiffAbsValue) << blob_name; - EXPECT_NEAR(2 * grad_diff_2, grad_diff_3, kErrorMargin) << blob_name; - } else { - EXPECT_EQ(0, grad_diff_2) << blob_name; - EXPECT_EQ(0, grad_diff_3) << blob_name; - } - } - } - - const Dtype kMinLossDiffAbsValue = 1e-4; - - Dtype loss_diff_2 = loss_main_2 - loss; - // Test non-triviality. - EXPECT_GT(fabs(loss_diff_2), kMinLossDiffAbsValue); - Dtype loss_diff_3 = loss_main_3 - loss; - EXPECT_NEAR(2 * loss_diff_2, loss_diff_3, kErrorMargin); - - loss_diff_2 = loss_midnet_2 - loss; - // Test non-triviality. - EXPECT_GT(fabs(loss_diff_2), kMinLossDiffAbsValue); - loss_diff_3 = loss_midnet_3 - loss; - EXPECT_NEAR(2 * loss_diff_2, loss_diff_3, kErrorMargin); -} - -TYPED_TEST(NetTest, TestBackwardWithAccuracyLayer) { - const bool kForceBackward = false; - const bool kAccuracyLayer = true; - this->InitTinyNet(kForceBackward, kAccuracyLayer); - EXPECT_TRUE(this->net_->has_blob("accuracy")); - // Test that we can do Backward even though we have an 'Accuracy' layer. 
- this->net_->ForwardBackward(); -} - -TYPED_TEST(NetTest, TestUnsharedWeightsDataNet) { - typedef typename TypeParam::Dtype Dtype; - this->InitUnsharedWeightsNet(); - Dtype loss; - this->net_->Forward(&loss); - EXPECT_GT(loss, 0); -} - -TYPED_TEST(NetTest, TestSharedWeightsDataNet) { - typedef typename TypeParam::Dtype Dtype; - this->InitSharedWeightsNet(); - Dtype loss; - this->net_->Forward(&loss); - EXPECT_FLOAT_EQ(loss, 0); -} - -TYPED_TEST(NetTest, TestUnsharedWeightsDiffNet) { - typedef typename TypeParam::Dtype Dtype; - this->InitUnsharedWeightsNet(); - Net* net = this->net_.get(); - net->Forward(); - net->Backward(); - Layer* ip1_layer = net->layer_by_name("innerproduct1").get(); - Layer* ip2_layer = net->layer_by_name("innerproduct2").get(); - const int count = ip1_layer->blobs()[0]->count(); - const Dtype* grad1 = ip1_layer->blobs()[0]->cpu_diff(); - const Dtype* grad2 = ip2_layer->blobs()[0]->cpu_diff(); - for (int i = 0; i < count; ++i) { - EXPECT_GT(fabs(grad1[i]), 0); - EXPECT_FLOAT_EQ(-1 * grad1[i], grad2[i]); - } -} - -TYPED_TEST(NetTest, TestSharedWeightsDiffNet) { - typedef typename TypeParam::Dtype Dtype; - this->InitSharedWeightsNet(); - Net* net = this->net_.get(); - Dtype loss; - net->Forward(&loss); - net->Backward(); - EXPECT_FLOAT_EQ(loss, 0); - Layer* ip1_layer = net->layer_by_name("innerproduct1").get(); - Layer* ip2_layer = net->layer_by_name("innerproduct2").get(); - const int count = ip1_layer->blobs()[0]->count(); - const Dtype* grad1 = ip1_layer->blobs()[0]->cpu_diff(); - const Dtype* grad2 = ip2_layer->blobs()[0]->cpu_diff(); - for (int i = 0; i < count; ++i) { - EXPECT_FLOAT_EQ(0, grad1[i]); - EXPECT_FLOAT_EQ(0, grad2[i]); - } -} - -TYPED_TEST(NetTest, TestSharedWeightsUpdate) { - typedef typename TypeParam::Dtype Dtype; - Caffe::set_random_seed(this->seed_); - this->InitDiffDataSharedWeightsNet(); - EXPECT_EQ(this->net_->layer_names()[1], "innerproduct1"); - EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2"); - Blob* ip1_weights = this->net_->layers()[1]->blobs()[0].get(); - Blob* ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - // Check that data and diff blobs of shared weights share the same memory - // locations. - EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); - this->net_->Forward(); - this->net_->Backward(); - // Compute the expected update as the data minus the two diffs. - Blob shared_params; - const bool reshape = true; - const bool copy_diff = false; - shared_params.CopyFrom(*ip1_weights, copy_diff, reshape); - shared_params.CopyFrom(*ip1_weights, !copy_diff, reshape); - const int count = ip1_weights->count(); - // Make sure the diffs are non-trivial. - for (int i = 0; i < count; ++i) { - EXPECT_NE(0, ip1_weights->cpu_diff()[i]); - } - caffe_axpy(count, Dtype(-1), shared_params.cpu_diff(), - shared_params.mutable_cpu_data()); - const Dtype* expected_updated_params = shared_params.cpu_data(); - this->net_->Update(); - const Dtype* actual_updated_params = ip1_weights->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_EQ(expected_updated_params[i], actual_updated_params[i]); - } - // Check that data blobs of shared weights STILL point to the same memory - // location (because ... who knows). 
- EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - - Caffe::set_random_seed(this->seed_); - this->InitDiffDataUnsharedWeightsNet(); - EXPECT_EQ(this->net_->layer_names()[1], "innerproduct1"); - EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2"); - ip1_weights = this->net_->layers()[1]->blobs()[0].get(); - ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - // Check that data and diff blobs of unshared weights are at different - // locations in memory. - EXPECT_NE(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); - this->net_->Forward(); - this->net_->Backward(); - // Compute the expected update. - Blob unshared_params1; - unshared_params1.CopyFrom(*ip1_weights, copy_diff, reshape); - unshared_params1.CopyFrom(*ip1_weights, !copy_diff, reshape); - Blob unshared_params2; - unshared_params2.CopyFrom(*ip2_weights, copy_diff, reshape); - unshared_params2.CopyFrom(*ip2_weights, !copy_diff, reshape); - // Make sure the diffs are non-trivial and sum to the diff in the shared net. - for (int i = 0; i < count; ++i) { - EXPECT_NE(0, ip1_weights->cpu_diff()[i]); - EXPECT_NE(0, ip2_weights->cpu_diff()[i]); - EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]); - EXPECT_FLOAT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i], - shared_params.cpu_diff()[i]); - } - caffe_axpy(count, Dtype(-1), ip1_weights->cpu_diff(), - unshared_params1.mutable_cpu_data()); - caffe_axpy(count, Dtype(-1), ip2_weights->cpu_diff(), - unshared_params2.mutable_cpu_data()); - const Dtype* expected_updated_params1 = unshared_params1.cpu_data(); - const Dtype* expected_updated_params2 = unshared_params2.cpu_data(); - this->net_->Update(); - const Dtype* actual_updated_params1 = ip1_weights->cpu_data(); - const Dtype* actual_updated_params2 = ip2_weights->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_EQ(expected_updated_params1[i], actual_updated_params1[i]); - EXPECT_EQ(expected_updated_params2[i], actual_updated_params2[i]); - EXPECT_NE(actual_updated_params1[i], actual_updated_params2[i]); - EXPECT_NE(expected_updated_params, expected_updated_params1); - } -} - -TYPED_TEST(NetTest, TestSharedWeightsResume) { - typedef typename TypeParam::Dtype Dtype; - - // Create a net with weight sharing; Update it once. - Caffe::set_random_seed(this->seed_); - this->InitDiffDataSharedWeightsNet(); - EXPECT_EQ(this->net_->layer_names()[1], "innerproduct1"); - EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2"); - Blob* ip1_weights = this->net_->layers()[1]->blobs()[0].get(); - Blob* ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - // Check that data and diff blobs of shared weights share the same memory - // locations. - EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); - this->net_->ForwardBackward(); - this->net_->Update(); - Blob shared_params; - const bool kReshape = true; - const bool kCopyDiff = false; - shared_params.CopyFrom(*ip1_weights, kCopyDiff, kReshape); - const int count = ip1_weights->count(); - - // Write the net to a NetParameter, as in Solver::Snapshot. - NetParameter net_param; - this->net_->ToProto(&net_param); - - // Reinitialize the net and copy parameters from net_param, as in - // Solver::Restore. 
- Caffe::set_random_seed(this->seed_); - this->InitDiffDataSharedWeightsNet(); - this->net_->CopyTrainedLayersFrom(net_param); - ip1_weights = this->net_->layers()[1]->blobs()[0].get(); - ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - ASSERT_FALSE(NULL == ip1_weights); - ASSERT_FALSE(NULL == ip2_weights); - EXPECT_NE(ip1_weights, ip2_weights); - // Check that data and diff blobs of shared weights share the same memory - // locations. - EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); - for (int i = 0; i < count; ++i) { - EXPECT_FLOAT_EQ(shared_params.cpu_data()[i], ip1_weights->cpu_data()[i]); - } -} - -TYPED_TEST(NetTest, TestParamPropagateDown) { - typedef typename TypeParam::Dtype Dtype; - const bool kBiasTerm = true, kForceBackward = false; - const Dtype* kLossWeight1 = NULL; - const Dtype* kLossWeight2 = NULL; - - // Run the net with all params learned; check that gradients are non-zero. - Caffe::set_random_seed(this->seed_); - Dtype blobs_lr_w1 = 1, blobs_lr_w2 = 1, blobs_lr_b1 = 2, blobs_lr_b2 = 2; - this->InitUnsharedWeightsNet(kLossWeight1, kLossWeight2, kForceBackward, - kBiasTerm, blobs_lr_w1, blobs_lr_w2, blobs_lr_b1, blobs_lr_b2); - this->net_->Forward(); - this->net_->Backward(); - const vector > >& params = this->net_->params(); - const int num_params = params.size(); - ASSERT_EQ(4, num_params); - const Dtype kNonZeroTestMin = 1e-3; - vector param_asums(params.size()); - for (int i = 0; i < num_params; ++i) { - const Dtype param_asum = - caffe_cpu_asum(params[i]->count(), params[i]->cpu_diff()); - param_asums[i] = param_asum; - EXPECT_GT(param_asum, kNonZeroTestMin); - } - - // Change the learning rates to different non-zero values; should see same - // gradients. - Caffe::set_random_seed(this->seed_); - blobs_lr_w1 *= 2, blobs_lr_w2 *= 2, blobs_lr_b1 *= 2, blobs_lr_b2 *= 2; - this->InitUnsharedWeightsNet(kLossWeight1, kLossWeight2, kForceBackward, - kBiasTerm, blobs_lr_w1, blobs_lr_w2, blobs_lr_b1, blobs_lr_b2); - this->net_->Forward(); - this->net_->Backward(); - const vector > >& params2 = this->net_->params(); - ASSERT_EQ(num_params, params2.size()); - for (int i = 0; i < num_params; ++i) { - const Dtype param_asum = - caffe_cpu_asum(params2[i]->count(), params2[i]->cpu_diff()); - EXPECT_FLOAT_EQ(param_asum, param_asums[i]); - } - - // Change a subset of the learning rates to zero; check that we see zero - // gradients for those. - Caffe::set_random_seed(this->seed_); - blobs_lr_w1 = 1, blobs_lr_w2 = 0, blobs_lr_b1 = 0, blobs_lr_b2 = 1; - this->InitUnsharedWeightsNet(kLossWeight1, kLossWeight2, kForceBackward, - kBiasTerm, blobs_lr_w1, blobs_lr_w2, blobs_lr_b1, blobs_lr_b2); - this->net_->Forward(); - this->net_->Backward(); - const vector > >& params3 = this->net_->params(); - ASSERT_EQ(num_params, params3.size()); - for (int i = 0; i < num_params; ++i) { - const Dtype param_asum = - caffe_cpu_asum(params3[i]->count(), params3[i]->cpu_diff()); - if (i == 1 || i == 2) { - EXPECT_FLOAT_EQ(0, param_asum); - } else { - EXPECT_FLOAT_EQ(param_asum, param_asums[i]); - } - } - - // Change the opposite subset of the learning rates to zero. 
- Caffe::set_random_seed(this->seed_); - blobs_lr_w1 = 0, blobs_lr_w2 = 1, blobs_lr_b1 = 1, blobs_lr_b2 = 0; - this->InitUnsharedWeightsNet(kLossWeight1, kLossWeight2, kForceBackward, - kBiasTerm, blobs_lr_w1, blobs_lr_w2, blobs_lr_b1, blobs_lr_b2); - this->net_->Forward(); - this->net_->Backward(); - const vector > >& params4 = this->net_->params(); - ASSERT_EQ(num_params, params4.size()); - for (int i = 0; i < num_params; ++i) { - const Dtype param_asum = - caffe_cpu_asum(params4[i]->count(), params4[i]->cpu_diff()); - if (i == 0 || i == 3) { - EXPECT_FLOAT_EQ(0, param_asum); - } else { - EXPECT_FLOAT_EQ(param_asum, param_asums[i]); - } - } -} - -TYPED_TEST(NetTest, TestFromTo) { - typedef typename TypeParam::Dtype Dtype; - this->InitTinyNet(); - - // Run Forward and Backward, recording the data diff and loss. - Blob data; - data.ReshapeLike(*this->net_->blob_by_name("data")); - this->net_->Forward(); - this->net_->Backward(); - data.CopyFrom(*this->net_->blob_by_name("data"), true, true); - const Dtype *loss_ptr = this->net_->output_blobs()[0]->cpu_data(); - Dtype loss = *loss_ptr; - - // Check that combining partial Forwards gives the same loss. - for (int i = 1; i < this->net_->layers().size(); ++i) { - // Note that we skip layer zero to keep the same data. - this->net_->ForwardFromTo(1, 1); - if (i < this->net_->layers().size() - 1) { - this->net_->ForwardFrom(i + 1); - } - EXPECT_EQ(loss, *loss_ptr); - } - - // Check that combining partial Backwards gives the same data diff. - for (int i = 1; i < this->net_->layers().size(); ++i) { - this->net_->BackwardTo(i); - this->net_->BackwardFrom(i - 1); - for (int j = 0; j < data.count(); ++j) { - EXPECT_EQ(data.cpu_diff()[j], - this->net_->blob_by_name("data")->cpu_diff()[j]); - } - } -} - -class FilterNetTest : public ::testing::Test { - protected: - void RunFilterNetTest( - const string& input_param_string, const string& filtered_param_string) { - NetParameter input_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - input_param_string, &input_param)); - NetParameter expected_filtered_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - filtered_param_string, &expected_filtered_param)); - NetParameter actual_filtered_param; - Net::FilterNet(input_param, &actual_filtered_param); - EXPECT_EQ(expected_filtered_param.DebugString(), - actual_filtered_param.DebugString()); - // Also test idempotence. 
- NetParameter double_filtered_param; - Net::FilterNet(actual_filtered_param, &double_filtered_param); - EXPECT_EQ(actual_filtered_param.DebugString(), - double_filtered_param.DebugString()); - } -}; - -TEST_F(FilterNetTest, TestNoFilter) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterLeNetTrainTest) { - const string& input_proto = - "name: 'LeNet' " - "layer { " - " name: 'mnist' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - " data_param { " - " source: 'mnist-train-leveldb' " - " batch_size: 64 " - " } " - " transform_param { " - " scale: 0.00390625 " - " } " - " include: { phase: TRAIN } " - "} " - "layer { " - " name: 'mnist' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - " data_param { " - " source: 'mnist-test-leveldb' " - " batch_size: 100 " - " } " - " transform_param { " - " scale: 0.00390625 " - " } " - " include: { phase: TEST } " - "} " - "layer { " - " name: 'conv1' " - " type: 'Convolution' " - " bottom: 'data' " - " top: 'conv1' " - " param { " - " lr_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " } " - " convolution_param { " - " num_output: 20 " - " kernel_size: 5 " - " stride: 1 " - " weight_filler { " - " type: 'xavier' " - " } " - " bias_filler { " - " type: 'constant' " - " } " - " } " - "} " - "layer { " - " name: 'ip1' " - " type: 'InnerProduct' " - " bottom: 'conv1' " - " top: 'ip1' " - " param { " - " lr_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " } " - " inner_product_param { " - " num_output: 10 " - " weight_filler { " - " type: 'xavier' " - " } " - " bias_filler { " - " type: 'constant' " - " } " - " } " - "} " - "layer { " - " name: 'accuracy' " - " type: 'Accuracy' " - " bottom: 'ip1' " - " bottom: 'label' " - " top: 'accuracy' " - " include: { phase: TEST } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'ip2' " - " bottom: 'label' " - " top: 'loss' " - "} "; - const string input_proto_train = "state: { phase: TRAIN } " + input_proto; - const string input_proto_test = "state: { phase: TEST } " + input_proto; - const string output_proto_train = - "name: 'LeNet' " - "layer { " - " name: 'mnist' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - " data_param { " - " source: 'mnist-train-leveldb' " - " batch_size: 64 " - " } " - " transform_param { " - " scale: 0.00390625 " - " } " - " include: { phase: TRAIN } " - "} " - "layer { " - " name: 'conv1' " - " type: 'Convolution' " - " bottom: 'data' " - " top: 'conv1' " - " param { " - " lr_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " } " - " convolution_param { " - " num_output: 20 " - " kernel_size: 5 " - " stride: 1 " - " weight_filler { " - " type: 'xavier' " - " } " - " bias_filler { " - " type: 'constant' " - " } " - " } " - "} " - "layer { " - " name: 'ip1' " - " type: 'InnerProduct' " - " bottom: 'conv1' " - " top: 'ip1' " - " param { " - " lr_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " } " - " inner_product_param { " - " num_output: 10 " - " weight_filler { " - " type: 'xavier' " - " } " - " bias_filler { " - " type: 'constant' " - " } " - " } " - "} " - "layer { " - " name: 'loss' " - 
" type: 'SoftmaxWithLoss' " - " bottom: 'ip2' " - " bottom: 'label' " - " top: 'loss' " - "} "; - const string& output_proto_test = - "name: 'LeNet' " - "layer { " - " name: 'mnist' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - " data_param { " - " source: 'mnist-test-leveldb' " - " batch_size: 100 " - " } " - " transform_param { " - " scale: 0.00390625 " - " } " - " include: { phase: TEST } " - "} " - "layer { " - " name: 'conv1' " - " type: 'Convolution' " - " bottom: 'data' " - " top: 'conv1' " - " param { " - " lr_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " } " - " convolution_param { " - " num_output: 20 " - " kernel_size: 5 " - " stride: 1 " - " weight_filler { " - " type: 'xavier' " - " } " - " bias_filler { " - " type: 'constant' " - " } " - " } " - "} " - "layer { " - " name: 'ip1' " - " type: 'InnerProduct' " - " bottom: 'conv1' " - " top: 'ip1' " - " param { " - " lr_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " } " - " inner_product_param { " - " num_output: 10 " - " weight_filler { " - " type: 'xavier' " - " } " - " bias_filler { " - " type: 'constant' " - " } " - " } " - "} " - "layer { " - " name: 'accuracy' " - " type: 'Accuracy' " - " bottom: 'ip1' " - " bottom: 'label' " - " top: 'accuracy' " - " include: { phase: TEST } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'ip2' " - " bottom: 'label' " - " top: 'loss' " - "} "; - const string output_proto_train_explicit = - output_proto_train + " state: { phase: TRAIN } "; - const string output_proto_test_explicit = - output_proto_test + " state: { phase: TEST } "; - this->RunFilterNetTest(input_proto_train, output_proto_train_explicit); - this->RunFilterNetTest(input_proto_test, output_proto_test_explicit); -} - -TEST_F(FilterNetTest, TestFilterOutByStage) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - " include: { stage: 'mystage' } " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - const string& output_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, output_proto); -} - -TEST_F(FilterNetTest, TestFilterOutByStage2) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { stage: 'mystage' } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - const string& output_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, output_proto); -} - -TEST_F(FilterNetTest, TestFilterInByStage) { - const string& input_proto = - "state: { stage: 'mystage' } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 
'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { stage: 'mystage' } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterInByStage2) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " exclude: { stage: 'mystage' } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterOutByMultipleStage) { - const string& input_proto = - "state: { stage: 'mystage' } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { stage: 'mystage' stage: 'myotherstage' } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { stage: 'mystage' } " - "} "; - const string& output_proto = - "state: { stage: 'mystage' } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { stage: 'mystage' } " - "} "; - this->RunFilterNetTest(input_proto, output_proto); -} - -TEST_F(FilterNetTest, TestFilterInByMultipleStage) { - const string& input_proto = - "state: { stage: 'mystage' } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { stage: 'myotherstage' } " - " include: { stage: 'mystage' } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { stage: 'mystage' } " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterInByMultipleStage2) { - const string& input_proto = - "state: { stage: 'mystage' stage: 'myotherstage' } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { stage: 'mystage' stage: 'myotherstage' } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { stage: 'mystage' } " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterInByNotStage) { - const string& input_proto = - "state: { stage: 'mystage' } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { not_stage: 'myotherstage' } " - "} " - "layer { " - 
" name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { not_stage: 'myotherstage' } " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterOutByNotStage) { - const string& input_proto = - "state: { stage: 'mystage' } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { not_stage: 'mystage' } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { not_stage: 'mystage' } " - "} "; - const string& output_proto = - "state: { stage: 'mystage' } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} "; - this->RunFilterNetTest(input_proto, output_proto); -} - -TEST_F(FilterNetTest, TestFilterOutByMinLevel) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { min_level: 3 } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - const string& output_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, output_proto); -} - -TEST_F(FilterNetTest, TestFilterOutByMaxLevel) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { max_level: -3 } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - const string& output_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, output_proto); -} - -TEST_F(FilterNetTest, TestFilterInByMinLevel) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { min_level: 0 } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterInByMinLevel2) { - const string& input_proto = - "state: { level: 7 } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { min_level: 3 } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 
'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterInByMaxLevel) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { max_level: 0 } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterInByMaxLevel2) { - const string& input_proto = - "state: { level: -7 } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { max_level: -3 } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunFilterNetTest(input_proto, input_proto); -} - -TEST_F(FilterNetTest, TestFilterInOutByIncludeMultiRule) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { min_level: 2 phase: TRAIN } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { min_level: 2 phase: TEST } " - "} "; - const string& input_proto_train = - "state: { level: 4 phase: TRAIN } " + input_proto; - const string& input_proto_test = - "state: { level: 4 phase: TEST } " + input_proto; - const string& output_proto_train = - "state: { level: 4 phase: TRAIN } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { min_level: 2 phase: TRAIN } " - "} "; - const string& output_proto_test = - "state: { level: 4 phase: TEST } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { min_level: 2 phase: TEST } " - "} "; - this->RunFilterNetTest(input_proto_train, output_proto_train); - this->RunFilterNetTest(input_proto_test, output_proto_test); -} - -TEST_F(FilterNetTest, TestFilterInByIncludeMultiRule) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " include: { min_level: 2 phase: TRAIN } " - " include: { phase: TEST } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { min_level: 2 phase: TEST } " - " include: { phase: TRAIN } " - "} "; - const string& input_proto_train = - "state: { level: 2 phase: TRAIN } " + input_proto; - const string& input_proto_test = - "state: { level: 2 phase: TEST } " + input_proto; - this->RunFilterNetTest(input_proto_train, input_proto_train); 
- this->RunFilterNetTest(input_proto_test, input_proto_test); -} - -TEST_F(FilterNetTest, TestFilterInOutByExcludeMultiRule) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " exclude: { min_level: 2 phase: TRAIN } " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " exclude: { min_level: 2 phase: TEST } " - "} "; - const string& input_proto_train = - "state: { level: 4 phase: TRAIN } " + input_proto; - const string& input_proto_test = - "state: { level: 4 phase: TEST } " + input_proto; - const string& output_proto_train = - "state: { level: 4 phase: TRAIN } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " exclude: { min_level: 2 phase: TEST } " - "} "; - const string& output_proto_test = - "state: { level: 4 phase: TEST } " - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - " exclude: { min_level: 2 phase: TRAIN } " - "} "; - this->RunFilterNetTest(input_proto_train, output_proto_train); - this->RunFilterNetTest(input_proto_test, output_proto_test); -} - -TYPED_TEST(NetTest, TestReshape) { - typedef typename TypeParam::Dtype Dtype; - // We set up bottom blobs of two different sizes, switch between - // them, check that forward and backward both run and the results - // are the same, and check that the output shapes change. - Caffe::set_random_seed(this->seed_); - Caffe::set_mode(Caffe::CPU); - FillerParameter filler_param; - filler_param.set_std(1); - GaussianFiller filler(filler_param); - // Check smaller shape first as larger first could hide realloc failures. 
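  // Concretely, blob1 below holds 2*3*12*10 = 720 values and blob2 holds
  // 4*3*9*11 = 1188, so the ASSERT_LT on their counts passes and the net is
  // first reshaped to the smaller input before growing to the larger one.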
- Blob blob1(2, 3, 12, 10); - Blob blob2(4, 3, 9, 11); - ASSERT_LT(blob1.count(), blob2.count()); - filler.Fill(&blob1); - filler.Fill(&blob2); - - this->InitReshapableNet(); - shared_ptr > input_blob = this->net_->blob_by_name("data"); - Blob* output_blob = this->net_->output_blobs()[0]; - input_blob->Reshape(blob1.num(), blob1.channels(), blob1.height(), - blob1.width()); - caffe_copy(blob1.count(), blob1.cpu_data(), input_blob->mutable_cpu_data()); - this->net_->Forward(); - // call backward just to make sure it runs - this->net_->Backward(); - Blob output1(output_blob->num(), output_blob->channels(), - output_blob->height(), output_blob->width()); - caffe_copy(output1.count(), output_blob->cpu_data(), - output1.mutable_cpu_data()); - - input_blob->Reshape(blob2.num(), blob2.channels(), blob2.height(), - blob2.width()); - caffe_copy(blob2.count(), blob2.cpu_data(), input_blob->mutable_cpu_data()); - this->net_->Forward(); - this->net_->Backward(); - Blob output2(output_blob->num(), output_blob->channels(), - output_blob->height(), output_blob->width()); - caffe_copy(output2.count(), output_blob->cpu_data(), - output2.mutable_cpu_data()); - - input_blob->Reshape(blob1.num(), blob1.channels(), blob1.height(), - blob1.width()); - caffe_copy(blob1.count(), blob1.cpu_data(), input_blob->mutable_cpu_data()); - this->net_->Forward(); - this->net_->Backward(); - for (int i = 0; i < output1.count(); ++i) { - EXPECT_FLOAT_EQ(*(output1.cpu_data() + i), *(output_blob->cpu_data() + i)); - } - - input_blob->Reshape(blob2.num(), blob2.channels(), blob2.height(), - blob2.width()); - caffe_copy(blob2.count(), blob2.cpu_data(), input_blob->mutable_cpu_data()); - this->net_->Forward(); - this->net_->Backward(); - for (int i = 0; i < output2.count(); ++i) { - EXPECT_FLOAT_EQ(*(output2.cpu_data() + i), *(output_blob->cpu_data() + i)); - } - - EXPECT_EQ(output1.num(), blob1.num()); - EXPECT_EQ(output2.num(), blob2.num()); - bool same_spatial_shape = true; - const int kFirstSpatialAxis = 2; - for (int i = kFirstSpatialAxis; i < output1.num_axes(); ++i) { - if (output1.shape(i) != output2.shape(i)) { - same_spatial_shape = false; - break; - } - } - EXPECT_FALSE(same_spatial_shape); -} - -TYPED_TEST(NetTest, TestSkipPropagateDown) { - // check bottom_need_backward if propagate_down is true - this->InitSkipPropNet(false); - vector vec_layer_need_backward = this->net_->layer_need_backward(); - for (int layer_id = 0; layer_id < this->net_->layers().size(); ++layer_id) { - string layer_name = this->net_->layer_names()[layer_id]; - if (layer_name == "loss") { - // access to bottom_need_backward coresponding to label's blob - bool need_back = this->net_->bottom_need_backward()[layer_id][1]; - // if propagate_down is true, the loss layer will try to - // backpropagate on labels - EXPECT_TRUE(need_back) << "bottom_need_backward should be True"; - } - // layer_need_backward should be True except for data and silence layers - if (layer_name.find("data") != std::string::npos || - layer_name == "silence") { - EXPECT_FALSE(vec_layer_need_backward[layer_id]) - << "layer_need_backward for " << layer_name << " should be False"; - } else { - EXPECT_TRUE(vec_layer_need_backward[layer_id]) - << "layer_need_backward for " << layer_name << " should be True"; - } - } - // check bottom_need_backward if propagat_down is false - this->InitSkipPropNet(true); - vec_layer_need_backward.clear(); - vec_layer_need_backward = this->net_->layer_need_backward(); - for (int layer_id = 0; layer_id < this->net_->layers().size(); ++layer_id) { - 
string layer_name = this->net_->layer_names()[layer_id]; - if (layer_name == "loss") { - // access to bottom_need_backward coresponding to label's blob - bool need_back = this->net_->bottom_need_backward()[layer_id][1]; - // if propagate_down is false, the loss layer will not try to - // backpropagate on labels - EXPECT_FALSE(need_back) << "bottom_need_backward should be False"; - } - // layer_need_backward should be False except for innerproduct and - // loss layers - if (layer_name == "innerproduct" || layer_name == "loss") { - EXPECT_TRUE(vec_layer_need_backward[layer_id]) - << "layer_need_backward for " << layer_name << " should be True"; - } else { - EXPECT_FALSE(vec_layer_need_backward[layer_id]) - << "layer_need_backward for " << layer_name << " should be False"; - } - } -} - -TYPED_TEST(NetTest, TestForcePropagateDown) { - this->InitForcePropNet(false); - vector layer_need_backward = this->net_->layer_need_backward(); - for (int layer_id = 0; layer_id < this->net_->layers().size(); ++layer_id) { - const string& layer_name = this->net_->layer_names()[layer_id]; - const vector need_backward = - this->net_->bottom_need_backward()[layer_id]; - if (layer_name == "data") { - ASSERT_EQ(need_backward.size(), 0); - EXPECT_FALSE(layer_need_backward[layer_id]); - } else if (layer_name == "innerproduct") { - ASSERT_EQ(need_backward.size(), 1); - EXPECT_FALSE(need_backward[0]); // data - EXPECT_TRUE(layer_need_backward[layer_id]); - } else if (layer_name == "loss") { - ASSERT_EQ(need_backward.size(), 2); - EXPECT_TRUE(need_backward[0]); // innerproduct - EXPECT_FALSE(need_backward[1]); // label - EXPECT_TRUE(layer_need_backward[layer_id]); - } else { - LOG(FATAL) << "Unknown layer: " << layer_name; - } - } - this->InitForcePropNet(true); - layer_need_backward = this->net_->layer_need_backward(); - for (int layer_id = 0; layer_id < this->net_->layers().size(); ++layer_id) { - const string& layer_name = this->net_->layer_names()[layer_id]; - const vector need_backward = - this->net_->bottom_need_backward()[layer_id]; - if (layer_name == "data") { - ASSERT_EQ(need_backward.size(), 0); - EXPECT_FALSE(layer_need_backward[layer_id]); - } else if (layer_name == "innerproduct") { - ASSERT_EQ(need_backward.size(), 1); - EXPECT_TRUE(need_backward[0]); // data - EXPECT_TRUE(layer_need_backward[layer_id]); - } else if (layer_name == "loss") { - ASSERT_EQ(need_backward.size(), 2); - EXPECT_TRUE(need_backward[0]); // innerproduct - EXPECT_FALSE(need_backward[1]); // label - EXPECT_TRUE(layer_need_backward[layer_id]); - } else { - LOG(FATAL) << "Unknown layer: " << layer_name; - } - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp deleted file mode 100644 index 342f825..0000000 --- a/src/caffe/test/test_neuron_layer.cpp +++ /dev/null @@ -1,938 +0,0 @@ -#include -#include - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" - -#include "caffe/layers/absval_layer.hpp" -#include "caffe/layers/bnll_layer.hpp" -#include "caffe/layers/dropout_layer.hpp" -#include "caffe/layers/elu_layer.hpp" -#include "caffe/layers/exp_layer.hpp" -#include "caffe/layers/inner_product_layer.hpp" -#include "caffe/layers/log_layer.hpp" -#include "caffe/layers/power_layer.hpp" -#include "caffe/layers/prelu_layer.hpp" -#include "caffe/layers/relu_layer.hpp" -#include "caffe/layers/sigmoid_layer.hpp" -#include "caffe/layers/tanh_layer.hpp" -#include 
"caffe/layers/threshold_layer.hpp" - -#ifdef USE_CUDNN -#include "caffe/layers/cudnn_relu_layer.hpp" -#include "caffe/layers/cudnn_sigmoid_layer.hpp" -#include "caffe/layers/cudnn_tanh_layer.hpp" -#endif - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class NeuronLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - NeuronLayerTest() - : blob_bottom_(new Blob(2, 3, 4, 5)), - blob_top_(new Blob()) { - Caffe::set_random_seed(1701); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~NeuronLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - - void TestDropoutForward(const float dropout_ratio) { - LayerParameter layer_param; - // Fill in the given dropout_ratio, unless it's 0.5, in which case we don't - // set it explicitly to test that 0.5 is the default. - if (dropout_ratio != 0.5) { - layer_param.mutable_dropout_param()->set_dropout_ratio(dropout_ratio); - } - DropoutLayer layer(layer_param); - layer_param.set_phase(TRAIN); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - float scale = 1. / (1. - layer_param.dropout_param().dropout_ratio()); - const int count = this->blob_bottom_->count(); - // Initialize num_kept to count the number of inputs NOT dropped out. - int num_kept = 0; - for (int i = 0; i < count; ++i) { - if (top_data[i] != 0) { - ++num_kept; - EXPECT_EQ(top_data[i], bottom_data[i] * scale); - } - } - const Dtype std_error = sqrt(dropout_ratio * (1 - dropout_ratio) / count); - // Fail if the number dropped was more than 1.96 * std_error away from the - // expected number -- requires 95% confidence that the dropout layer is not - // obeying the given dropout_ratio for test failure. 
- const Dtype empirical_dropout_ratio = 1 - num_kept / Dtype(count); - EXPECT_NEAR(empirical_dropout_ratio, dropout_ratio, 1.96 * std_error); - } - - void TestExpForward(const float base, const float scale, const float shift) { - LayerParameter layer_param; - layer_param.mutable_exp_param()->set_base(base); - layer_param.mutable_exp_param()->set_scale(scale); - layer_param.mutable_exp_param()->set_shift(shift); - ExpLayer layer(layer_param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - layer.Forward(blob_bottom_vec_, blob_top_vec_); - const Dtype kDelta = 2e-4; - const Dtype* bottom_data = blob_bottom_->cpu_data(); - const Dtype* top_data = blob_top_->cpu_data(); - for (int i = 0; i < blob_bottom_->count(); ++i) { - const Dtype bottom_val = bottom_data[i]; - const Dtype top_val = top_data[i]; - if (base == -1) { - EXPECT_NEAR(top_val, exp(shift + scale * bottom_val), kDelta); - } else { - EXPECT_NEAR(top_val, pow(base, shift + scale * bottom_val), kDelta); - } - } - } - - void TestExpGradient(const float base, const float scale, const float shift) { - LayerParameter layer_param; - layer_param.mutable_exp_param()->set_base(base); - layer_param.mutable_exp_param()->set_scale(scale); - layer_param.mutable_exp_param()->set_shift(shift); - ExpLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, blob_bottom_vec_, blob_top_vec_); - } - - void TestPReLU(PReLULayer *layer) { - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - const Dtype* slope_data = layer->blobs()[0]->cpu_data(); - int hw = this->blob_bottom_->height() * this->blob_bottom_->width(); - int channels = this->blob_bottom_->channels(); - bool channel_shared = layer->layer_param().prelu_param().channel_shared(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - int c = channel_shared ? 
0 : (i / hw) % channels; - EXPECT_EQ(top_data[i], - std::max(bottom_data[i], (Dtype)(0)) - + slope_data[c] * std::min(bottom_data[i], (Dtype)(0))); - } - } - - void LogBottomInit() { - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - Dtype* bottom_data = this->blob_bottom_->mutable_cpu_data(); - caffe_exp(this->blob_bottom_->count(), bottom_data, bottom_data); - } - - void TestLogForward(const float base, const float scale, const float shift) { - LogBottomInit(); - LayerParameter layer_param; - layer_param.mutable_log_param()->set_base(base); - layer_param.mutable_log_param()->set_scale(scale); - layer_param.mutable_log_param()->set_shift(shift); - LogLayer layer(layer_param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - layer.Forward(blob_bottom_vec_, blob_top_vec_); - const Dtype kDelta = 2e-4; - const Dtype* bottom_data = blob_bottom_->cpu_data(); - const Dtype* top_data = blob_top_->cpu_data(); - for (int i = 0; i < blob_bottom_->count(); ++i) { - const Dtype bottom_val = bottom_data[i]; - const Dtype top_val = top_data[i]; - if (base == -1) { - EXPECT_NEAR(top_val, log(shift + scale * bottom_val), kDelta); - } else { - EXPECT_NEAR(top_val, log(shift + scale * bottom_val) / log(base), - kDelta); - } - } - } - - void TestLogGradient(const float base, const float scale, const float shift) { - LogBottomInit(); - LayerParameter layer_param; - layer_param.mutable_log_param()->set_base(base); - layer_param.mutable_log_param()->set_scale(scale); - layer_param.mutable_log_param()->set_shift(shift); - LogLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientEltwise(&layer, blob_bottom_vec_, blob_top_vec_); - } -}; - -TYPED_TEST_CASE(NeuronLayerTest, TestDtypesAndDevices); - -TYPED_TEST(NeuronLayerTest, TestAbsVal) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - AbsValLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - const int count = this->blob_bottom_->count(); - for (int i = 0; i < count; ++i) { - EXPECT_EQ(top_data[i], fabs(bottom_data[i])); - } -} - -TYPED_TEST(NeuronLayerTest, TestAbsGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - AbsValLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestReLU) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ReLULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_GE(top_data[i], 0.); - EXPECT_TRUE(top_data[i] == 0 || top_data[i] == bottom_data[i]); - } -} - -TYPED_TEST(NeuronLayerTest, TestReLUGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ReLULayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestReLUWithNegativeSlope) { 
- typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "relu_param { negative_slope: 0.01 }", &layer_param)); - ReLULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - if (top_data[i] >= 0) { - EXPECT_FLOAT_EQ(top_data[i], bottom_data[i]); - } else { - EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] * 0.01); - } - } -} - -TYPED_TEST(NeuronLayerTest, TestReLUGradientWithNegativeSlope) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "relu_param { negative_slope: 0.01 }", &layer_param)); - ReLULayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestELU) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "elu_param { alpha: 0.5 }", &layer_param)); - ELULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype kDelta = 2e-4; - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - if (bottom_data[i] > 0) { - EXPECT_FLOAT_EQ(top_data[i], bottom_data[i]); - } else { - EXPECT_NEAR(top_data[i], 0.5 * (exp(bottom_data[i]) - 1), kDelta); - } - } -} - -TYPED_TEST(NeuronLayerTest, TestELUasReLU) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "elu_param { alpha: 0 }", &layer_param)); - ELULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_GE(top_data[i], 0.); - EXPECT_TRUE(top_data[i] == 0 || top_data[i] == bottom_data[i]); - } -} - -TYPED_TEST(NeuronLayerTest, TestELUGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ELULayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestELUasReLUGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "elu_param { alpha: 0 }", &layer_param)); - ELULayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestSigmoid) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SigmoidLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values 
- const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_FLOAT_EQ(top_data[i], 1. / (1 + exp(-bottom_data[i]))); - // check that we squashed the value between 0 and 1 - EXPECT_GE(top_data[i], 0.); - EXPECT_LE(top_data[i], 1.); - } -} - -TYPED_TEST(NeuronLayerTest, TestSigmoidGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SigmoidLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestTanH) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - TanHLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Test exact values - for (int i = 0; i < this->blob_bottom_->num(); ++i) { - for (int j = 0; j < this->blob_bottom_->channels(); ++j) { - for (int k = 0; k < this->blob_bottom_->height(); ++k) { - for (int l = 0; l < this->blob_bottom_->width(); ++l) { - EXPECT_GE(this->blob_top_->data_at(i, j, k, l) + 1e-4, - (exp(2*this->blob_bottom_->data_at(i, j, k, l)) - 1) / - (exp(2*this->blob_bottom_->data_at(i, j, k, l)) + 1)); - EXPECT_LE(this->blob_top_->data_at(i, j, k, l) - 1e-4, - (exp(2*this->blob_bottom_->data_at(i, j, k, l)) - 1) / - (exp(2*this->blob_bottom_->data_at(i, j, k, l)) + 1)); - } - } - } - } -} - -TYPED_TEST(NeuronLayerTest, TestTanHGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - TanHLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestExpLayer) { - typedef typename TypeParam::Dtype Dtype; - // Test default base of "-1" -- should actually set base := e. - const Dtype kBase = -1; - const Dtype kScale = 1; - const Dtype kShift = 0; - this->TestExpForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpGradient) { - typedef typename TypeParam::Dtype Dtype; - // Test default base of "-1" -- should actually set base := e. 
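  // As TestExpForward above verifies, ExpLayer computes y = base^(shift + scale*x),
  // with the sentinel base = -1 meaning the natural base, i.e. y = exp(shift +
  // scale*x). For example, the Base2Shift1Scale3 cases further down map x = 0 to
  // 2^(1 + 3*0) = 2.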
- const Dtype kBase = -1; - const Dtype kScale = 1; - const Dtype kShift = 0; - this->TestExpGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpLayerWithShift) { - typedef typename TypeParam::Dtype Dtype; - // Test default base of "-1" -- should actually set base := e, - // with a non-zero shift - const Dtype kBase = -1; - const Dtype kScale = 1; - const Dtype kShift = 1; - this->TestExpForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpGradientWithShift) { - typedef typename TypeParam::Dtype Dtype; - // Test default base of "-1" -- should actually set base := e, - // with a non-zero shift - const Dtype kBase = -1; - const Dtype kScale = 1; - const Dtype kShift = 1; - this->TestExpGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpLayerBase2) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 1; - const Dtype kShift = 0; - this->TestExpForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpGradientBase2) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 1; - const Dtype kShift = 0; - this->TestExpGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpLayerBase2Shift1) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 1; - const Dtype kShift = 1; - this->TestExpForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpGradientBase2Shift1) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 1; - const Dtype kShift = 1; - this->TestExpGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpLayerBase2Scale3) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 3; - const Dtype kShift = 0; - this->TestExpForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpGradientBase2Scale3) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 3; - const Dtype kShift = 0; - this->TestExpGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpLayerBase2Shift1Scale3) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 3; - const Dtype kShift = 1; - this->TestExpForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestExpGradientBase2Shift1Scale3) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 3; - const Dtype kShift = 1; - this->TestExpGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogLayer) { - typedef typename TypeParam::Dtype Dtype; - // Test default base of "-1" -- should actually set base := e. - const Dtype kBase = -1; - const Dtype kScale = 1; - const Dtype kShift = 0; - this->TestLogForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogGradient) { - typedef typename TypeParam::Dtype Dtype; - // Test default base of "-1" -- should actually set base := e. 
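  // LogLayer is checked the same way: TestLogForward expects
  // y = log(shift + scale*x) / log(base), with base = -1 again meaning the
  // natural log. LogBottomInit exponentiates the Gaussian-filled bottom first,
  // so shift + scale*x stays positive and the logarithm is well defined for the
  // scale/shift combinations tested here.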
- const Dtype kBase = -1; - const Dtype kScale = 1; - const Dtype kShift = 0; - this->TestLogGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogLayerBase2) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 1; - const Dtype kShift = 0; - this->TestLogForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogGradientBase2) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 1; - const Dtype kShift = 0; - this->TestLogGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogLayerBase2Shift1) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 1; - const Dtype kShift = 1; - this->TestLogForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogGradientBase2Shift1) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 1; - const Dtype kShift = 1; - this->TestLogGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogLayerBase2Scale3) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 3; - const Dtype kShift = 0; - this->TestLogForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogGradientBase2Scale3) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 3; - const Dtype kShift = 0; - this->TestLogGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogLayerBase2Shift1Scale3) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 3; - const Dtype kShift = 1; - this->TestLogForward(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestLogGradientBase2Shift1Scale3) { - typedef typename TypeParam::Dtype Dtype; - const Dtype kBase = 2; - const Dtype kScale = 3; - const Dtype kShift = 1; - this->TestLogGradient(kBase, kScale, kShift); -} - -TYPED_TEST(NeuronLayerTest, TestDropoutHalf) { - const float kDropoutRatio = 0.5; - this->TestDropoutForward(kDropoutRatio); -} - -TYPED_TEST(NeuronLayerTest, TestDropoutThreeQuarters) { - const float kDropoutRatio = 0.75; - this->TestDropoutForward(kDropoutRatio); -} - -TYPED_TEST(NeuronLayerTest, TestDropoutTestPhase) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.set_phase(TEST); - DropoutLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - if (top_data[i] != 0) { - EXPECT_EQ(top_data[i], bottom_data[i]); - } - } -} - -TYPED_TEST(NeuronLayerTest, TestDropoutGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.set_phase(TRAIN); - DropoutLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestDropoutGradientTest) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.set_phase(TEST); - DropoutLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestBNLL) { - typedef typename 
TypeParam::Dtype Dtype; - LayerParameter layer_param; - BNLLLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_GE(top_data[i], 0.); - EXPECT_GE(top_data[i], bottom_data[i]); - } -} - -TYPED_TEST(NeuronLayerTest, TestBNLLGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BNLLLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestPReLUParam) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - PReLULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* slopes = layer.blobs()[0]->cpu_data(); - int count = layer.blobs()[0]->count(); - for (int i = 0; i < count; ++i, ++slopes) { - EXPECT_EQ(*slopes, 0.25); - } -} - -TYPED_TEST(NeuronLayerTest, TestPReLUForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - PReLULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(layer.blobs()[0].get()); - this->TestPReLU(&layer); -} - -TYPED_TEST(NeuronLayerTest, TestPReLUForwardChannelShared) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_prelu_param()->set_channel_shared(true); - PReLULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - this->TestPReLU(&layer); -} - -TYPED_TEST(NeuronLayerTest, TestPReLUGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - PReLULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(layer.blobs()[0].get()); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestPReLUGradientChannelShared) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_prelu_param()->set_channel_shared(true); - PReLULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(NeuronLayerTest, TestPReLUConsistencyReLU) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter prelu_layer_param; - LayerParameter relu_layer_param; - relu_layer_param.mutable_relu_param()->set_negative_slope(0.25); - PReLULayer prelu(prelu_layer_param); - ReLULayer relu(relu_layer_param); - // Set up blobs - vector*> blob_bottom_vec_2; - vector*> blob_top_vec_2; - shared_ptr > blob_bottom_2(new Blob()); - shared_ptr > blob_top_2(new Blob()); - blob_bottom_vec_2.push_back(blob_bottom_2.get()); - blob_top_vec_2.push_back(blob_top_2.get()); - blob_bottom_2->CopyFrom(*this->blob_bottom_, false, true); - // SetUp layers - prelu.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - relu.SetUp(blob_bottom_vec_2, blob_top_vec_2); - // Check forward - 
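  // PReLU left at its default constant slope of 0.25 (verified by TestPReLUParam
  // above) computes max(x, 0) + 0.25 * min(x, 0), which is exactly leaky ReLU
  // with negative_slope: 0.25, so the two layers should produce identical tops
  // here and identical bottom diffs in the backward check that follows.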
prelu.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - relu.Forward(this->blob_bottom_vec_, blob_top_vec_2); - for (int s = 0; s < blob_top_2->count(); ++s) { - EXPECT_EQ(this->blob_top_->cpu_data()[s], blob_top_2->cpu_data()[s]); - } - // Check backward - shared_ptr > tmp_blob(new Blob()); - tmp_blob->ReshapeLike(*blob_top_2.get()); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(tmp_blob.get()); - caffe_copy(blob_top_2->count(), tmp_blob->cpu_data(), - this->blob_top_->mutable_cpu_diff()); - caffe_copy(blob_top_2->count(), tmp_blob->cpu_data(), - blob_top_2->mutable_cpu_diff()); - vector propagate_down; - propagate_down.push_back(true); - prelu.Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - relu.Backward(blob_top_vec_2, propagate_down, blob_bottom_vec_2); - for (int s = 0; s < blob_bottom_2->count(); ++s) { - EXPECT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); - } -} - -TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) { - typedef typename TypeParam::Dtype Dtype; - // Set layer parameters - LayerParameter ip_layer_param; - LayerParameter prelu_layer_param; - InnerProductParameter *ip_param = - ip_layer_param.mutable_inner_product_param(); - ip_param->mutable_weight_filler()->set_type("gaussian"); - ip_param->set_num_output(3); - InnerProductLayer ip(ip_layer_param); - PReLULayer prelu(prelu_layer_param); - InnerProductLayer ip2(ip_layer_param); - PReLULayer prelu2(prelu_layer_param); - // Set up blobs - vector*> blob_bottom_vec_2; - vector*> blob_middle_vec_2; - vector*> blob_top_vec_2; - shared_ptr > blob_bottom_2(new Blob()); - shared_ptr > blob_middle_2(new Blob()); - shared_ptr > blob_top_2(new Blob()); - blob_bottom_vec_2.push_back(blob_bottom_2.get()); - blob_middle_vec_2.push_back(blob_middle_2.get()); - blob_top_vec_2.push_back(blob_top_2.get()); - blob_bottom_2->CopyFrom(*this->blob_bottom_, false, true); - // SetUp layers - ip.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - prelu.SetUp(this->blob_top_vec_, this->blob_top_vec_); - ip2.SetUp(blob_bottom_vec_2, blob_middle_vec_2); - prelu2.SetUp(blob_middle_vec_2, blob_top_vec_2); - caffe_copy(ip2.blobs()[0]->count(), ip.blobs()[0]->cpu_data(), - ip2.blobs()[0]->mutable_cpu_data()); - // Forward in-place - ip.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - prelu.Forward(this->blob_top_vec_, this->blob_top_vec_); - // Forward non-in-place - ip2.Forward(blob_bottom_vec_2, blob_middle_vec_2); - prelu2.Forward(blob_middle_vec_2, blob_top_vec_2); - // Check numbers - for (int s = 0; s < blob_top_2->count(); ++s) { - EXPECT_EQ(this->blob_top_->cpu_data()[s], blob_top_2->cpu_data()[s]); - } - // Fill top diff with random numbers - shared_ptr > tmp_blob(new Blob()); - tmp_blob->ReshapeLike(*blob_top_2.get()); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(tmp_blob.get()); - caffe_copy(blob_top_2->count(), tmp_blob->cpu_data(), - this->blob_top_->mutable_cpu_diff()); - caffe_copy(blob_top_2->count(), tmp_blob->cpu_data(), - blob_top_2->mutable_cpu_diff()); - // Backward in-place - vector propagate_down; - propagate_down.push_back(true); - prelu.Backward(this->blob_top_vec_, propagate_down, this->blob_top_vec_); - ip.Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - // Backward non-in-place - prelu2.Backward(blob_top_vec_2, propagate_down, blob_middle_vec_2); - ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2); - // Check numbers - for (int s = 0; s < 
blob_bottom_2->count(); ++s) { - EXPECT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); - } - for (int s = 0; s < ip.blobs()[0]->count(); ++s) { - EXPECT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]); - } - for (int s = 0; s < ip.blobs()[1]->count(); ++s) { - EXPECT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]); - } - for (int s = 0; s < prelu.blobs()[0]->count(); ++s) { - EXPECT_EQ(prelu.blobs()[0]->cpu_diff()[s], - prelu2.blobs()[0]->cpu_diff()[s]); - } -} - -#ifdef USE_CUDNN -template -class CuDNNNeuronLayerTest : public GPUDeviceTest { - protected: - CuDNNNeuronLayerTest() - : blob_bottom_(new Blob(2, 3, 4, 5)), - blob_top_(new Blob()) { - Caffe::set_random_seed(1701); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~CuDNNNeuronLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(CuDNNNeuronLayerTest, TestDtypes); - -TYPED_TEST(CuDNNNeuronLayerTest, TestReLUCuDNN) { - LayerParameter layer_param; - CuDNNReLULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - const TypeParam* top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_GE(top_data[i], 0.); - EXPECT_TRUE(top_data[i] == 0 || top_data[i] == bottom_data[i]); - } -} - -TYPED_TEST(CuDNNNeuronLayerTest, TestReLUGradientCuDNN) { - LayerParameter layer_param; - CuDNNReLULayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(CuDNNNeuronLayerTest, TestReLUWithNegativeSlopeCuDNN) { - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "relu_param { negative_slope: 0.01 }", &layer_param)); - CuDNNReLULayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - const TypeParam* top_data = this->blob_top_->cpu_data(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - if (top_data[i] >= 0) { - EXPECT_FLOAT_EQ(top_data[i], bottom_data[i]); - } else { - EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] * 0.01); - } - } -} - -TYPED_TEST(CuDNNNeuronLayerTest, TestReLUGradientWithNegativeSlopeCuDNN) { - LayerParameter layer_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - "relu_param { negative_slope: 0.01 }", &layer_param)); - CuDNNReLULayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(CuDNNNeuronLayerTest, TestSigmoidCuDNN) { - LayerParameter layer_param; - CuDNNSigmoidLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - const TypeParam* top_data = this->blob_top_->cpu_data(); - for (int i 
= 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_FLOAT_EQ(top_data[i], 1. / (1 + exp(-bottom_data[i]))); - // check that we squashed the value between 0 and 1 - EXPECT_GE(top_data[i], 0.); - EXPECT_LE(top_data[i], 1.); - } -} - -TYPED_TEST(CuDNNNeuronLayerTest, TestSigmoidGradientCuDNN) { - LayerParameter layer_param; - CuDNNSigmoidLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(CuDNNNeuronLayerTest, TestTanHCuDNN) { - LayerParameter layer_param; - CuDNNTanHLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Test exact values - for (int i = 0; i < this->blob_bottom_->num(); ++i) { - for (int j = 0; j < this->blob_bottom_->channels(); ++j) { - for (int k = 0; k < this->blob_bottom_->height(); ++k) { - for (int l = 0; l < this->blob_bottom_->width(); ++l) { - EXPECT_GE(this->blob_top_->data_at(i, j, k, l) + 1e-4, - (exp(2*this->blob_bottom_->data_at(i, j, k, l)) - 1) / - (exp(2*this->blob_bottom_->data_at(i, j, k, l)) + 1)); - EXPECT_LE(this->blob_top_->data_at(i, j, k, l) - 1e-4, - (exp(2*this->blob_bottom_->data_at(i, j, k, l)) - 1) / - (exp(2*this->blob_bottom_->data_at(i, j, k, l)) + 1)); - } - } - } - } -} - -TYPED_TEST(CuDNNNeuronLayerTest, TestTanHGradientCuDNN) { - LayerParameter layer_param; - CuDNNTanHLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} -#endif - -} // namespace caffe diff --git a/src/caffe/test/test_platform.cpp b/src/caffe/test/test_platform.cpp deleted file mode 100644 index f3513e0..0000000 --- a/src/caffe/test/test_platform.cpp +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef CPU_ONLY - -#include -#include - -#include "glog/logging.h" -#include "gtest/gtest.h" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; - -class PlatformTest : public ::testing::Test {}; - -TEST_F(PlatformTest, TestInitialization) { - printf("Major revision number: %d\n", CAFFE_TEST_CUDA_PROP.major); - printf("Minor revision number: %d\n", CAFFE_TEST_CUDA_PROP.minor); - printf("Name: %s\n", CAFFE_TEST_CUDA_PROP.name); - printf("Total global memory: %lu\n", - CAFFE_TEST_CUDA_PROP.totalGlobalMem); - printf("Total shared memory per block: %lu\n", - CAFFE_TEST_CUDA_PROP.sharedMemPerBlock); - printf("Total registers per block: %d\n", - CAFFE_TEST_CUDA_PROP.regsPerBlock); - printf("Warp size: %d\n", - CAFFE_TEST_CUDA_PROP.warpSize); - printf("Maximum memory pitch: %lu\n", - CAFFE_TEST_CUDA_PROP.memPitch); - printf("Maximum threads per block: %d\n", - CAFFE_TEST_CUDA_PROP.maxThreadsPerBlock); - for (int i = 0; i < 3; ++i) - printf("Maximum dimension %d of block: %d\n", i, - CAFFE_TEST_CUDA_PROP.maxThreadsDim[i]); - for (int i = 0; i < 3; ++i) - printf("Maximum dimension %d of grid: %d\n", i, - CAFFE_TEST_CUDA_PROP.maxGridSize[i]); - printf("Clock rate: %d\n", CAFFE_TEST_CUDA_PROP.clockRate); - printf("Total constant memory: %lu\n", - CAFFE_TEST_CUDA_PROP.totalConstMem); - printf("Texture alignment: %lu\n", - CAFFE_TEST_CUDA_PROP.textureAlignment); - printf("Concurrent copy and execution: %s\n", - (CAFFE_TEST_CUDA_PROP.deviceOverlap ? 
"Yes" : "No")); - printf("Number of multiprocessors: %d\n", - CAFFE_TEST_CUDA_PROP.multiProcessorCount); - printf("Kernel execution timeout: %s\n", - (CAFFE_TEST_CUDA_PROP.kernelExecTimeoutEnabled ? "Yes" : "No")); - printf("Unified virtual addressing: %s\n", - (CAFFE_TEST_CUDA_PROP.unifiedAddressing ? "Yes" : "No")); - EXPECT_TRUE(true); -} - -} // namespace caffe - -#endif // CPU_ONLY diff --git a/src/caffe/test/test_pooling_layer.cpp b/src/caffe/test/test_pooling_layer.cpp deleted file mode 100644 index bb95cae..0000000 --- a/src/caffe/test/test_pooling_layer.cpp +++ /dev/null @@ -1,1185 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/pooling_layer.hpp" - -#ifdef USE_CUDNN -#include "caffe/layers/cudnn_pooling_layer.hpp" -#endif - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class PoolingLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - PoolingLayerTest() - : blob_bottom_(new Blob()), - blob_top_(new Blob()), - blob_top_mask_(new Blob()) {} - virtual void SetUp() { - Caffe::set_random_seed(1701); - blob_bottom_->Reshape(2, 3, 6, 5); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~PoolingLayerTest() { - delete blob_bottom_; - delete blob_top_; - delete blob_top_mask_; - } - Blob* const blob_bottom_; - Blob* const blob_top_; - Blob* const blob_top_mask_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - // Test for 2x 2 square pooling layer - void TestForwardSquare() { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - const int num = 2; - const int channels = 2; - blob_bottom_->Reshape(num, channels, 3, 5); - // Input: 2x 2 channels of: - // [1 2 5 2 3] - // [9 4 1 4 8] - // [1 2 5 2 3] - for (int i = 0; i < 15 * num * channels; i += 15) { - blob_bottom_->mutable_cpu_data()[i + 0] = 1; - blob_bottom_->mutable_cpu_data()[i + 1] = 2; - blob_bottom_->mutable_cpu_data()[i + 2] = 5; - blob_bottom_->mutable_cpu_data()[i + 3] = 2; - blob_bottom_->mutable_cpu_data()[i + 4] = 3; - blob_bottom_->mutable_cpu_data()[i + 5] = 9; - blob_bottom_->mutable_cpu_data()[i + 6] = 4; - blob_bottom_->mutable_cpu_data()[i + 7] = 1; - blob_bottom_->mutable_cpu_data()[i + 8] = 4; - blob_bottom_->mutable_cpu_data()[i + 9] = 8; - blob_bottom_->mutable_cpu_data()[i + 10] = 1; - blob_bottom_->mutable_cpu_data()[i + 11] = 2; - blob_bottom_->mutable_cpu_data()[i + 12] = 5; - blob_bottom_->mutable_cpu_data()[i + 13] = 2; - blob_bottom_->mutable_cpu_data()[i + 14] = 3; - } - PoolingLayer layer(layer_param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_->num(), num); - EXPECT_EQ(blob_top_->channels(), channels); - EXPECT_EQ(blob_top_->height(), 2); - EXPECT_EQ(blob_top_->width(), 4); - if (blob_top_vec_.size() > 1) { - EXPECT_EQ(blob_top_mask_->num(), num); - EXPECT_EQ(blob_top_mask_->channels(), channels); - EXPECT_EQ(blob_top_mask_->height(), 2); - EXPECT_EQ(blob_top_mask_->width(), 4); - } - layer.Forward(blob_bottom_vec_, blob_top_vec_); - // Expected output: 2x 2 channels of: - // [9 5 5 8] - // [9 5 5 8] - for (int i = 0; i < 8 * 
num * channels; i += 8) { - EXPECT_EQ(blob_top_->cpu_data()[i + 0], 9); - EXPECT_EQ(blob_top_->cpu_data()[i + 1], 5); - EXPECT_EQ(blob_top_->cpu_data()[i + 2], 5); - EXPECT_EQ(blob_top_->cpu_data()[i + 3], 8); - EXPECT_EQ(blob_top_->cpu_data()[i + 4], 9); - EXPECT_EQ(blob_top_->cpu_data()[i + 5], 5); - EXPECT_EQ(blob_top_->cpu_data()[i + 6], 5); - EXPECT_EQ(blob_top_->cpu_data()[i + 7], 8); - } - if (blob_top_vec_.size() > 1) { - // Expected mask output: 2x 2 channels of: - // [5 2 2 9] - // [5 12 12 9] - for (int i = 0; i < 8 * num * channels; i += 8) { - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 5); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 2); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 2); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 9); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 5); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 12); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 12); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 9); - } - } - } - // Test for 3x 2 rectangular pooling layer with kernel_h > kernel_w - void TestForwardRectHigh() { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(3); - pooling_param->set_kernel_w(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - const int num = 2; - const int channels = 2; - blob_bottom_->Reshape(num, channels, 6, 6); - // Input: 2x 2 channels of: - // [35 1 6 26 19 24] - // [ 3 32 7 21 23 25] - // [31 9 2 22 27 20] - // [ 8 28 33 17 10 15] - // [30 5 34 12 14 16] - // [ 4 36 29 13 18 11] - // (this is generated by magic(6) in MATLAB) - for (int i = 0; i < 36 * num * channels; i += 36) { - blob_bottom_->mutable_cpu_data()[i + 0] = 35; - blob_bottom_->mutable_cpu_data()[i + 1] = 1; - blob_bottom_->mutable_cpu_data()[i + 2] = 6; - blob_bottom_->mutable_cpu_data()[i + 3] = 26; - blob_bottom_->mutable_cpu_data()[i + 4] = 19; - blob_bottom_->mutable_cpu_data()[i + 5] = 24; - blob_bottom_->mutable_cpu_data()[i + 6] = 3; - blob_bottom_->mutable_cpu_data()[i + 7] = 32; - blob_bottom_->mutable_cpu_data()[i + 8] = 7; - blob_bottom_->mutable_cpu_data()[i + 9] = 21; - blob_bottom_->mutable_cpu_data()[i + 10] = 23; - blob_bottom_->mutable_cpu_data()[i + 11] = 25; - blob_bottom_->mutable_cpu_data()[i + 12] = 31; - blob_bottom_->mutable_cpu_data()[i + 13] = 9; - blob_bottom_->mutable_cpu_data()[i + 14] = 2; - blob_bottom_->mutable_cpu_data()[i + 15] = 22; - blob_bottom_->mutable_cpu_data()[i + 16] = 27; - blob_bottom_->mutable_cpu_data()[i + 17] = 20; - blob_bottom_->mutable_cpu_data()[i + 18] = 8; - blob_bottom_->mutable_cpu_data()[i + 19] = 28; - blob_bottom_->mutable_cpu_data()[i + 20] = 33; - blob_bottom_->mutable_cpu_data()[i + 21] = 17; - blob_bottom_->mutable_cpu_data()[i + 22] = 10; - blob_bottom_->mutable_cpu_data()[i + 23] = 15; - blob_bottom_->mutable_cpu_data()[i + 24] = 30; - blob_bottom_->mutable_cpu_data()[i + 25] = 5; - blob_bottom_->mutable_cpu_data()[i + 26] = 34; - blob_bottom_->mutable_cpu_data()[i + 27] = 12; - blob_bottom_->mutable_cpu_data()[i + 28] = 14; - blob_bottom_->mutable_cpu_data()[i + 29] = 16; - blob_bottom_->mutable_cpu_data()[i + 30] = 4; - blob_bottom_->mutable_cpu_data()[i + 31] = 36; - blob_bottom_->mutable_cpu_data()[i + 32] = 29; - blob_bottom_->mutable_cpu_data()[i + 33] = 13; - blob_bottom_->mutable_cpu_data()[i + 34] = 18; - blob_bottom_->mutable_cpu_data()[i + 35] = 11; - } - PoolingLayer layer(layer_param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_->num(), num); - 
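  // With the default stride of 1 and no padding, max pooling the 6x6 input with
  // a 3x2 (h x w) kernel gives an output height of (6 - 3)/1 + 1 = 4 and width
  // of (6 - 2)/1 + 1 = 5, which is what the shape checks below expect.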
EXPECT_EQ(blob_top_->channels(), channels); - EXPECT_EQ(blob_top_->height(), 4); - EXPECT_EQ(blob_top_->width(), 5); - if (blob_top_vec_.size() > 1) { - EXPECT_EQ(blob_top_mask_->num(), num); - EXPECT_EQ(blob_top_mask_->channels(), channels); - EXPECT_EQ(blob_top_mask_->height(), 4); - EXPECT_EQ(blob_top_mask_->width(), 5); - } - layer.Forward(blob_bottom_vec_, blob_top_vec_); - // Expected output: 2x 2 channels of: - // [35 32 26 27 27] - // [32 33 33 27 27] - // [31 34 34 27 27] - // [36 36 34 18 18] - for (int i = 0; i < 20 * num * channels; i += 20) { - EXPECT_EQ(blob_top_->cpu_data()[i + 0], 35); - EXPECT_EQ(blob_top_->cpu_data()[i + 1], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 2], 26); - EXPECT_EQ(blob_top_->cpu_data()[i + 3], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 4], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 5], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 6], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 7], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 8], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 9], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 10], 31); - EXPECT_EQ(blob_top_->cpu_data()[i + 11], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 13], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 14], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 15], 36); - EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36); - EXPECT_EQ(blob_top_->cpu_data()[i + 17], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 18], 18); - EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18); - } - if (blob_top_vec_.size() > 1) { - // [ 1 8 4 17 17] - // [ 8 21 21 17 17] - // [13 27 27 17 17] - // [32 32 27 35 35] - for (int i = 0; i < 20 * num * channels; i += 20) { - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 0); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 3); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 8], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 9], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 12); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 31); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 34); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34); - } - } - } - // Test for rectangular pooling layer with kernel_w > kernel_h - void TestForwardRectWide() { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(2); - pooling_param->set_kernel_w(3); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - const int num = 2; - const int channels = 2; - blob_bottom_->Reshape(num, channels, 6, 6); - // Input: 2x 2 channels of: - // [35 1 6 26 19 24] - // [ 3 32 7 21 23 25] - // [31 9 2 22 27 20] - // [ 8 28 33 17 10 15] - // [30 5 34 12 14 16] - // [ 4 36 29 13 18 11] - // (this is generated by magic(6) in MATLAB) - for (int i = 0; i < 36 * num * channels; i += 36) { - blob_bottom_->mutable_cpu_data()[i + 0] = 35; - blob_bottom_->mutable_cpu_data()[i + 1] = 1; - 
blob_bottom_->mutable_cpu_data()[i + 2] = 6; - blob_bottom_->mutable_cpu_data()[i + 3] = 26; - blob_bottom_->mutable_cpu_data()[i + 4] = 19; - blob_bottom_->mutable_cpu_data()[i + 5] = 24; - blob_bottom_->mutable_cpu_data()[i + 6] = 3; - blob_bottom_->mutable_cpu_data()[i + 7] = 32; - blob_bottom_->mutable_cpu_data()[i + 8] = 7; - blob_bottom_->mutable_cpu_data()[i + 9] = 21; - blob_bottom_->mutable_cpu_data()[i + 10] = 23; - blob_bottom_->mutable_cpu_data()[i + 11] = 25; - blob_bottom_->mutable_cpu_data()[i + 12] = 31; - blob_bottom_->mutable_cpu_data()[i + 13] = 9; - blob_bottom_->mutable_cpu_data()[i + 14] = 2; - blob_bottom_->mutable_cpu_data()[i + 15] = 22; - blob_bottom_->mutable_cpu_data()[i + 16] = 27; - blob_bottom_->mutable_cpu_data()[i + 17] = 20; - blob_bottom_->mutable_cpu_data()[i + 18] = 8; - blob_bottom_->mutable_cpu_data()[i + 19] = 28; - blob_bottom_->mutable_cpu_data()[i + 20] = 33; - blob_bottom_->mutable_cpu_data()[i + 21] = 17; - blob_bottom_->mutable_cpu_data()[i + 22] = 10; - blob_bottom_->mutable_cpu_data()[i + 23] = 15; - blob_bottom_->mutable_cpu_data()[i + 24] = 30; - blob_bottom_->mutable_cpu_data()[i + 25] = 5; - blob_bottom_->mutable_cpu_data()[i + 26] = 34; - blob_bottom_->mutable_cpu_data()[i + 27] = 12; - blob_bottom_->mutable_cpu_data()[i + 28] = 14; - blob_bottom_->mutable_cpu_data()[i + 29] = 16; - blob_bottom_->mutable_cpu_data()[i + 30] = 4; - blob_bottom_->mutable_cpu_data()[i + 31] = 36; - blob_bottom_->mutable_cpu_data()[i + 32] = 29; - blob_bottom_->mutable_cpu_data()[i + 33] = 13; - blob_bottom_->mutable_cpu_data()[i + 34] = 18; - blob_bottom_->mutable_cpu_data()[i + 35] = 11; - } - PoolingLayer layer(layer_param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_->num(), num); - EXPECT_EQ(blob_top_->channels(), channels); - EXPECT_EQ(blob_top_->height(), 5); - EXPECT_EQ(blob_top_->width(), 4); - if (blob_top_vec_.size() > 1) { - EXPECT_EQ(blob_top_mask_->num(), num); - EXPECT_EQ(blob_top_mask_->channels(), channels); - EXPECT_EQ(blob_top_mask_->height(), 5); - EXPECT_EQ(blob_top_mask_->width(), 4); - } - layer.Forward(blob_bottom_vec_, blob_top_vec_); - // Expected output: 2x 2 channels of: - // [35 32 26 26] - // [32 32 27 27] - // [33 33 33 27] - // [34 34 34 17] - // [36 36 34 18] - for (int i = 0; i < 20 * num * channels; i += 20) { - EXPECT_EQ(blob_top_->cpu_data()[i + 0], 35); - EXPECT_EQ(blob_top_->cpu_data()[i + 1], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 2], 26); - EXPECT_EQ(blob_top_->cpu_data()[i + 3], 26); - EXPECT_EQ(blob_top_->cpu_data()[i + 4], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 5], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 6], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 7], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 8], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 9], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 10], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 11], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 13], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 14], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 15], 17); - EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36); - EXPECT_EQ(blob_top_->cpu_data()[i + 17], 36); - EXPECT_EQ(blob_top_->cpu_data()[i + 18], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18); - } - if (blob_top_vec_.size() > 1) { - // [ 1 8 4 4] - // [ 8 8 17 17] - // [21 21 21 17] - // [27 27 27 22] - // [32 32 27 35] - for (int i = 0; i < 20 * num * channels; i += 20) { - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 0); - 
EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 3); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 3); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 8], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 9], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 21); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 31); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34); - } - } - } -}; - -TYPED_TEST_CASE(PoolingLayerTest, TestDtypesAndDevices); - -TYPED_TEST(PoolingLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); - EXPECT_EQ(this->blob_top_->height(), 3); - EXPECT_EQ(this->blob_top_->width(), 2); -} - -TYPED_TEST(PoolingLayerTest, TestSetupPadded) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(1); - pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); - EXPECT_EQ(this->blob_top_->height(), 4); - EXPECT_EQ(this->blob_top_->width(), 3); -} - -TYPED_TEST(PoolingLayerTest, TestSetupGlobalPooling) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_global_pooling(true); - pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); -} - -/* -TYPED_TEST(PoolingLayerTest, PrintBackward) { - LayerParameter layer_param; - layer_param.set_kernelsize(3); - layer_param.set_stride(2); - layer_param.set_pool(LayerParameter_PoolMethod_MAX); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - cout << "bottom data " << i << " " << this->blob_bottom_->cpu_data()[i] << endl; - } - for (int i = 0; i < this->blob_top_->count(); ++i) { - cout << "top data " << i << " " << 
this->blob_top_->cpu_data()[i] << endl; - } - - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = i; - } - layer.Backward(this->blob_top_vec_, true, this->blob_bottom_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - cout << "bottom diff " << i << " " << this->blob_bottom_->cpu_diff()[i] << endl; - } -} -*/ - -TYPED_TEST(PoolingLayerTest, TestForwardMax) { - this->TestForwardSquare(); - this->TestForwardRectHigh(); - this->TestForwardRectWide(); -} - -TYPED_TEST(PoolingLayerTest, TestForwardMaxTopMask) { - this->blob_top_vec_.push_back(this->blob_top_mask_); - this->TestForwardSquare(); - this->TestForwardRectHigh(); - this->TestForwardRectWide(); -} - -TYPED_TEST(PoolingLayerTest, TestGradientMax) { - typedef typename TypeParam::Dtype Dtype; - for (int kernel_h = 3; kernel_h <= 4; kernel_h++) { - for (int kernel_w = 3; kernel_w <= 4; kernel_w++) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(kernel_h); - pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pad(1); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - PoolingLayer layer(layer_param); - GradientChecker checker(1e-4, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - } -} - -TYPED_TEST(PoolingLayerTest, TestForwardMaxPadded) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - this->blob_bottom_->Reshape(1, 1, 3, 3); - // Input: - // [ 1 2 4 ] - // [ 2 3 2 ] - // [ 4 2 1 ] - this->blob_bottom_->mutable_cpu_data()[0] = 1; - this->blob_bottom_->mutable_cpu_data()[1] = 2; - this->blob_bottom_->mutable_cpu_data()[2] = 4; - this->blob_bottom_->mutable_cpu_data()[3] = 2; - this->blob_bottom_->mutable_cpu_data()[4] = 3; - this->blob_bottom_->mutable_cpu_data()[5] = 2; - this->blob_bottom_->mutable_cpu_data()[6] = 4; - this->blob_bottom_->mutable_cpu_data()[7] = 2; - this->blob_bottom_->mutable_cpu_data()[8] = 1; - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 1); - EXPECT_EQ(this->blob_top_->channels(), 1); - EXPECT_EQ(this->blob_top_->height(), 3); - EXPECT_EQ(this->blob_top_->width(), 3); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Dtype epsilon = 1e-8; - // Output: - // [ 1 4 4 ] - // [ 4 4 4 ] - // [ 4 4 1 ] - EXPECT_NEAR(this->blob_top_->cpu_data()[0], 1, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[1], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[2], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[3], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[4], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[5], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[6], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[7], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[8], 1, epsilon); -} - -TYPED_TEST(PoolingLayerTest, TestGradientMaxTopMask) { - typedef typename TypeParam::Dtype Dtype; - for (int kernel_h = 3; kernel_h <= 4; kernel_h++) { - for (int kernel_w = 3; kernel_w <= 4; kernel_w++) { - LayerParameter layer_param; - PoolingParameter* pooling_param = 
layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(kernel_h); - pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - this->blob_top_vec_.push_back(this->blob_top_mask_); - PoolingLayer layer(layer_param); - GradientChecker checker(1e-4, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - this->blob_top_vec_.pop_back(); - } - } -} - -TYPED_TEST(PoolingLayerTest, TestForwardAve) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(1); - pooling_param->set_pad(1); - pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - this->blob_bottom_->Reshape(1, 1, 3, 3); - FillerParameter filler_param; - filler_param.set_value(Dtype(2)); - ConstantFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 1); - EXPECT_EQ(this->blob_top_->channels(), 1); - EXPECT_EQ(this->blob_top_->height(), 3); - EXPECT_EQ(this->blob_top_->width(), 3); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Dtype epsilon = 1e-5; - EXPECT_NEAR(this->blob_top_->cpu_data()[0], 8.0 / 9, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[1], 4.0 / 3, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[2], 8.0 / 9, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[3], 4.0 / 3, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[4], 2.0 , epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[5], 4.0 / 3, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[6], 8.0 / 9, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[7], 4.0 / 3, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[8], 8.0 / 9, epsilon); -} - -TYPED_TEST(PoolingLayerTest, TestGradientAve) { - typedef typename TypeParam::Dtype Dtype; - for (int kernel_h = 3; kernel_h <= 4; kernel_h++) { - for (int kernel_w = 3; kernel_w <= 4; kernel_w++) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(kernel_h); - pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - PoolingLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - } -} - -TYPED_TEST(PoolingLayerTest, TestGradientAvePadded) { - typedef typename TypeParam::Dtype Dtype; - for (int kernel_h = 3; kernel_h <= 4; kernel_h++) { - for (int kernel_w = 3; kernel_w <= 4; kernel_w++) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(kernel_h); - pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pad(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - PoolingLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - } -} - -#ifdef USE_CUDNN -template -class CuDNNPoolingLayerTest : public GPUDeviceTest { - protected: - CuDNNPoolingLayerTest() - : blob_bottom_(new Blob()), - blob_top_(new Blob()), - blob_top_mask_(new Blob()) {} - virtual void SetUp() { - 
Caffe::set_random_seed(1701); - blob_bottom_->Reshape(2, 3, 6, 5); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~CuDNNPoolingLayerTest() { - delete blob_bottom_; - delete blob_top_; - delete blob_top_mask_; - } - Blob* const blob_bottom_; - Blob* const blob_top_; - Blob* const blob_top_mask_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; - // Test for 2x 2 square pooling layer - void TestForwardSquare() { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - const int num = 2; - const int channels = 2; - blob_bottom_->Reshape(num, channels, 3, 5); - // Input: 2x 2 channels of: - // [1 2 5 2 3] - // [9 4 1 4 8] - // [1 2 5 2 3] - for (int i = 0; i < 15 * num * channels; i += 15) { - blob_bottom_->mutable_cpu_data()[i + 0] = 1; - blob_bottom_->mutable_cpu_data()[i + 1] = 2; - blob_bottom_->mutable_cpu_data()[i + 2] = 5; - blob_bottom_->mutable_cpu_data()[i + 3] = 2; - blob_bottom_->mutable_cpu_data()[i + 4] = 3; - blob_bottom_->mutable_cpu_data()[i + 5] = 9; - blob_bottom_->mutable_cpu_data()[i + 6] = 4; - blob_bottom_->mutable_cpu_data()[i + 7] = 1; - blob_bottom_->mutable_cpu_data()[i + 8] = 4; - blob_bottom_->mutable_cpu_data()[i + 9] = 8; - blob_bottom_->mutable_cpu_data()[i + 10] = 1; - blob_bottom_->mutable_cpu_data()[i + 11] = 2; - blob_bottom_->mutable_cpu_data()[i + 12] = 5; - blob_bottom_->mutable_cpu_data()[i + 13] = 2; - blob_bottom_->mutable_cpu_data()[i + 14] = 3; - } - CuDNNPoolingLayer layer(layer_param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_->num(), num); - EXPECT_EQ(blob_top_->channels(), channels); - EXPECT_EQ(blob_top_->height(), 2); - EXPECT_EQ(blob_top_->width(), 4); - if (blob_top_vec_.size() > 1) { - EXPECT_EQ(blob_top_mask_->num(), num); - EXPECT_EQ(blob_top_mask_->channels(), channels); - EXPECT_EQ(blob_top_mask_->height(), 2); - EXPECT_EQ(blob_top_mask_->width(), 4); - } - layer.Forward(blob_bottom_vec_, blob_top_vec_); - // Expected output: 2x 2 channels of: - // [9 5 5 8] - // [9 5 5 8] - for (int i = 0; i < 8 * num * channels; i += 8) { - EXPECT_EQ(blob_top_->cpu_data()[i + 0], 9); - EXPECT_EQ(blob_top_->cpu_data()[i + 1], 5); - EXPECT_EQ(blob_top_->cpu_data()[i + 2], 5); - EXPECT_EQ(blob_top_->cpu_data()[i + 3], 8); - EXPECT_EQ(blob_top_->cpu_data()[i + 4], 9); - EXPECT_EQ(blob_top_->cpu_data()[i + 5], 5); - EXPECT_EQ(blob_top_->cpu_data()[i + 6], 5); - EXPECT_EQ(blob_top_->cpu_data()[i + 7], 8); - } - if (blob_top_vec_.size() > 1) { - // Expected mask output: 2x 2 channels of: - // [5 2 2 9] - // [5 12 12 9] - for (int i = 0; i < 8 * num * channels; i += 8) { - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 5); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 2); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 2); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 9); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 5); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 12); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 12); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 9); - } - } - } - // Test for 3x 2 rectangular pooling layer with kernel_h > kernel_w - void TestForwardRectHigh() { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - 
pooling_param->set_kernel_h(3); - pooling_param->set_kernel_w(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - const int num = 2; - const int channels = 2; - blob_bottom_->Reshape(num, channels, 6, 6); - // Input: 2x 2 channels of: - // [35 1 6 26 19 24] - // [ 3 32 7 21 23 25] - // [31 9 2 22 27 20] - // [ 8 28 33 17 10 15] - // [30 5 34 12 14 16] - // [ 4 36 29 13 18 11] - // (this is generated by magic(6) in MATLAB) - for (int i = 0; i < 36 * num * channels; i += 36) { - blob_bottom_->mutable_cpu_data()[i + 0] = 35; - blob_bottom_->mutable_cpu_data()[i + 1] = 1; - blob_bottom_->mutable_cpu_data()[i + 2] = 6; - blob_bottom_->mutable_cpu_data()[i + 3] = 26; - blob_bottom_->mutable_cpu_data()[i + 4] = 19; - blob_bottom_->mutable_cpu_data()[i + 5] = 24; - blob_bottom_->mutable_cpu_data()[i + 6] = 3; - blob_bottom_->mutable_cpu_data()[i + 7] = 32; - blob_bottom_->mutable_cpu_data()[i + 8] = 7; - blob_bottom_->mutable_cpu_data()[i + 9] = 21; - blob_bottom_->mutable_cpu_data()[i + 10] = 23; - blob_bottom_->mutable_cpu_data()[i + 11] = 25; - blob_bottom_->mutable_cpu_data()[i + 12] = 31; - blob_bottom_->mutable_cpu_data()[i + 13] = 9; - blob_bottom_->mutable_cpu_data()[i + 14] = 2; - blob_bottom_->mutable_cpu_data()[i + 15] = 22; - blob_bottom_->mutable_cpu_data()[i + 16] = 27; - blob_bottom_->mutable_cpu_data()[i + 17] = 20; - blob_bottom_->mutable_cpu_data()[i + 18] = 8; - blob_bottom_->mutable_cpu_data()[i + 19] = 28; - blob_bottom_->mutable_cpu_data()[i + 20] = 33; - blob_bottom_->mutable_cpu_data()[i + 21] = 17; - blob_bottom_->mutable_cpu_data()[i + 22] = 10; - blob_bottom_->mutable_cpu_data()[i + 23] = 15; - blob_bottom_->mutable_cpu_data()[i + 24] = 30; - blob_bottom_->mutable_cpu_data()[i + 25] = 5; - blob_bottom_->mutable_cpu_data()[i + 26] = 34; - blob_bottom_->mutable_cpu_data()[i + 27] = 12; - blob_bottom_->mutable_cpu_data()[i + 28] = 14; - blob_bottom_->mutable_cpu_data()[i + 29] = 16; - blob_bottom_->mutable_cpu_data()[i + 30] = 4; - blob_bottom_->mutable_cpu_data()[i + 31] = 36; - blob_bottom_->mutable_cpu_data()[i + 32] = 29; - blob_bottom_->mutable_cpu_data()[i + 33] = 13; - blob_bottom_->mutable_cpu_data()[i + 34] = 18; - blob_bottom_->mutable_cpu_data()[i + 35] = 11; - } - CuDNNPoolingLayer layer(layer_param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_->num(), num); - EXPECT_EQ(blob_top_->channels(), channels); - EXPECT_EQ(blob_top_->height(), 4); - EXPECT_EQ(blob_top_->width(), 5); - if (blob_top_vec_.size() > 1) { - EXPECT_EQ(blob_top_mask_->num(), num); - EXPECT_EQ(blob_top_mask_->channels(), channels); - EXPECT_EQ(blob_top_mask_->height(), 4); - EXPECT_EQ(blob_top_mask_->width(), 5); - } - layer.Forward(blob_bottom_vec_, blob_top_vec_); - // Expected output: 2x 2 channels of: - // [35 32 26 27 27] - // [32 33 33 27 27] - // [31 34 34 27 27] - // [36 36 34 18 18] - for (int i = 0; i < 20 * num * channels; i += 20) { - EXPECT_EQ(blob_top_->cpu_data()[i + 0], 35); - EXPECT_EQ(blob_top_->cpu_data()[i + 1], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 2], 26); - EXPECT_EQ(blob_top_->cpu_data()[i + 3], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 4], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 5], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 6], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 7], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 8], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 9], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 10], 31); - EXPECT_EQ(blob_top_->cpu_data()[i + 11], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34); - 
EXPECT_EQ(blob_top_->cpu_data()[i + 13], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 14], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 15], 36); - EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36); - EXPECT_EQ(blob_top_->cpu_data()[i + 17], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 18], 18); - EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18); - } - if (blob_top_vec_.size() > 1) { - // [ 1 8 4 17 17] - // [ 8 21 21 17 17] - // [13 27 27 17 17] - // [32 32 27 35 35] - for (int i = 0; i < 20 * num * channels; i += 20) { - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 0); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 3); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 8], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 9], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 12); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 31); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 34); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34); - } - } - } - // Test for rectangular pooling layer with kernel_w > kernel_h - void TestForwardRectWide() { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(2); - pooling_param->set_kernel_w(3); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - const int num = 2; - const int channels = 2; - blob_bottom_->Reshape(num, channels, 6, 6); - // Input: 2x 2 channels of: - // [35 1 6 26 19 24] - // [ 3 32 7 21 23 25] - // [31 9 2 22 27 20] - // [ 8 28 33 17 10 15] - // [30 5 34 12 14 16] - // [ 4 36 29 13 18 11] - // (this is generated by magic(6) in MATLAB) - for (int i = 0; i < 36 * num * channels; i += 36) { - blob_bottom_->mutable_cpu_data()[i + 0] = 35; - blob_bottom_->mutable_cpu_data()[i + 1] = 1; - blob_bottom_->mutable_cpu_data()[i + 2] = 6; - blob_bottom_->mutable_cpu_data()[i + 3] = 26; - blob_bottom_->mutable_cpu_data()[i + 4] = 19; - blob_bottom_->mutable_cpu_data()[i + 5] = 24; - blob_bottom_->mutable_cpu_data()[i + 6] = 3; - blob_bottom_->mutable_cpu_data()[i + 7] = 32; - blob_bottom_->mutable_cpu_data()[i + 8] = 7; - blob_bottom_->mutable_cpu_data()[i + 9] = 21; - blob_bottom_->mutable_cpu_data()[i + 10] = 23; - blob_bottom_->mutable_cpu_data()[i + 11] = 25; - blob_bottom_->mutable_cpu_data()[i + 12] = 31; - blob_bottom_->mutable_cpu_data()[i + 13] = 9; - blob_bottom_->mutable_cpu_data()[i + 14] = 2; - blob_bottom_->mutable_cpu_data()[i + 15] = 22; - blob_bottom_->mutable_cpu_data()[i + 16] = 27; - blob_bottom_->mutable_cpu_data()[i + 17] = 20; - blob_bottom_->mutable_cpu_data()[i + 18] = 8; - blob_bottom_->mutable_cpu_data()[i + 19] = 28; - blob_bottom_->mutable_cpu_data()[i + 20] = 33; - blob_bottom_->mutable_cpu_data()[i + 21] = 17; - blob_bottom_->mutable_cpu_data()[i + 22] = 10; - blob_bottom_->mutable_cpu_data()[i + 23] = 15; - blob_bottom_->mutable_cpu_data()[i + 24] = 30; - blob_bottom_->mutable_cpu_data()[i + 25] = 5; - 
blob_bottom_->mutable_cpu_data()[i + 26] = 34; - blob_bottom_->mutable_cpu_data()[i + 27] = 12; - blob_bottom_->mutable_cpu_data()[i + 28] = 14; - blob_bottom_->mutable_cpu_data()[i + 29] = 16; - blob_bottom_->mutable_cpu_data()[i + 30] = 4; - blob_bottom_->mutable_cpu_data()[i + 31] = 36; - blob_bottom_->mutable_cpu_data()[i + 32] = 29; - blob_bottom_->mutable_cpu_data()[i + 33] = 13; - blob_bottom_->mutable_cpu_data()[i + 34] = 18; - blob_bottom_->mutable_cpu_data()[i + 35] = 11; - } - CuDNNPoolingLayer layer(layer_param); - layer.SetUp(blob_bottom_vec_, blob_top_vec_); - EXPECT_EQ(blob_top_->num(), num); - EXPECT_EQ(blob_top_->channels(), channels); - EXPECT_EQ(blob_top_->height(), 5); - EXPECT_EQ(blob_top_->width(), 4); - if (blob_top_vec_.size() > 1) { - EXPECT_EQ(blob_top_mask_->num(), num); - EXPECT_EQ(blob_top_mask_->channels(), channels); - EXPECT_EQ(blob_top_mask_->height(), 5); - EXPECT_EQ(blob_top_mask_->width(), 4); - } - layer.Forward(blob_bottom_vec_, blob_top_vec_); - // Expected output: 2x 2 channels of: - // [35 32 26 26] - // [32 32 27 27] - // [33 33 33 27] - // [34 34 34 17] - // [36 36 34 18] - for (int i = 0; i < 20 * num * channels; i += 20) { - EXPECT_EQ(blob_top_->cpu_data()[i + 0], 35); - EXPECT_EQ(blob_top_->cpu_data()[i + 1], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 2], 26); - EXPECT_EQ(blob_top_->cpu_data()[i + 3], 26); - EXPECT_EQ(blob_top_->cpu_data()[i + 4], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 5], 32); - EXPECT_EQ(blob_top_->cpu_data()[i + 6], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 7], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 8], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 9], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 10], 33); - EXPECT_EQ(blob_top_->cpu_data()[i + 11], 27); - EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 13], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 14], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 15], 17); - EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36); - EXPECT_EQ(blob_top_->cpu_data()[i + 17], 36); - EXPECT_EQ(blob_top_->cpu_data()[i + 18], 34); - EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18); - } - if (blob_top_vec_.size() > 1) { - // [ 1 8 4 4] - // [ 8 8 17 17] - // [21 21 21 17] - // [27 27 27 22] - // [32 32 27 35] - for (int i = 0; i < 20 * num * channels; i += 20) { - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 0); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 3); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 3); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 7); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 8], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 9], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 20); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 16); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 21); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 31); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 26); - EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34); - } - } - } -}; - -TYPED_TEST_CASE(CuDNNPoolingLayerTest, TestDtypes); - -TYPED_TEST(CuDNNPoolingLayerTest, TestSetupCuDNN) { - LayerParameter layer_param; - PoolingParameter* pooling_param = 
layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - CuDNNPoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); - EXPECT_EQ(this->blob_top_->height(), 3); - EXPECT_EQ(this->blob_top_->width(), 2); -} - -TYPED_TEST(CuDNNPoolingLayerTest, TestSetupPaddedCuDNN) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(1); - pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - CuDNNPoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); - EXPECT_EQ(this->blob_top_->height(), 4); - EXPECT_EQ(this->blob_top_->width(), 3); -} - -/* -TYPED_TEST(CuDNNPoolingLayerTest, PrintBackwardCuDNN) { - LayerParameter layer_param; - layer_param.set_kernelsize(3); - layer_param.set_stride(2); - layer_param.set_pool(LayerParameter_PoolMethod_MAX); - CuDNNPoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - cout << "bottom data " << i << " " << this->blob_bottom_->cpu_data()[i] << endl; - } - for (int i = 0; i < this->blob_top_->count(); ++i) { - cout << "top data " << i << " " << this->blob_top_->cpu_data()[i] << endl; - } - - for (int i = 0; i < this->blob_top_->count(); ++i) { - this->blob_top_->mutable_cpu_diff()[i] = i; - } - layer.Backward(this->blob_top_vec_, true, this->blob_bottom_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - cout << "bottom diff " << i << " " << this->blob_bottom_->cpu_diff()[i] << endl; - } -} -*/ - -TYPED_TEST(CuDNNPoolingLayerTest, TestForwardMaxCuDNN) { - this->TestForwardSquare(); - this->TestForwardRectHigh(); - this->TestForwardRectWide(); -} - -// Currently, cuDNN does not support a top mask, so we comment this and -// the corresponding backward test. 
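As the expected mask values in the CPU tests above show, e.g. [5 2 2 9; 5 12 12 9] for the 3x5 square-pooling input, the optional second top blob records, for every pooling window, the flattened spatial offset of the argmax within its channel. A minimal sketch of that index arithmetic follows, using hypothetical names rather than any Caffe API; the commented-out TestForwardMaxTopMaskCuDNN below would exercise the same convention once cuDNN supports a top mask.

// Sketch only: for a maximum found at (h_max, w_max) inside a channel of
// width W, the mask entry checked in the tests above is the flat offset
// h_max * W + w_max. Example from the square test: the 9 sits at (1, 0)
// in a 3x5 map, giving 1 * 5 + 0 == 5.
inline int pooling_mask_index(int h_max, int w_max, int width) {
  return h_max * width + w_max;
}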
-/* -TYPED_TEST(CuDNNPoolingLayerTest, TestForwardMaxTopMaskCuDNN) { - this->blob_top_vec_.push_back(this->blob_top_mask_); - this->TestForwardSquare(); - this->TestForwardRectHigh(); - this->TestForwardRectWide(); -} -*/ - -TYPED_TEST(CuDNNPoolingLayerTest, TestGradientMaxCuDNN) { - for (int kernel_h = 3; kernel_h <= 4; kernel_h++) { - for (int kernel_w = 3; kernel_w <= 4; kernel_w++) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(kernel_h); - pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - // currenty, cuDNN pooling does not support padding - pooling_param->set_pad(0); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - CuDNNPoolingLayer layer(layer_param); - GradientChecker checker(1e-4, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - } -} - -TYPED_TEST(CuDNNPoolingLayerTest, TestForwardMaxPaddedCuDNN) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - this->blob_bottom_->Reshape(1, 1, 3, 3); - // Input: - // [ 1 2 4 ] - // [ 2 3 2 ] - // [ 4 2 1 ] - this->blob_bottom_->mutable_cpu_data()[0] = 1; - this->blob_bottom_->mutable_cpu_data()[1] = 2; - this->blob_bottom_->mutable_cpu_data()[2] = 4; - this->blob_bottom_->mutable_cpu_data()[3] = 2; - this->blob_bottom_->mutable_cpu_data()[4] = 3; - this->blob_bottom_->mutable_cpu_data()[5] = 2; - this->blob_bottom_->mutable_cpu_data()[6] = 4; - this->blob_bottom_->mutable_cpu_data()[7] = 2; - this->blob_bottom_->mutable_cpu_data()[8] = 1; - CuDNNPoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 1); - EXPECT_EQ(this->blob_top_->channels(), 1); - EXPECT_EQ(this->blob_top_->height(), 3); - EXPECT_EQ(this->blob_top_->width(), 3); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - TypeParam epsilon = 1e-8; - // Output: - // [ 1 4 4 ] - // [ 4 4 4 ] - // [ 4 4 1 ] - EXPECT_NEAR(this->blob_top_->cpu_data()[0], 1, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[1], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[2], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[3], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[4], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[5], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[6], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[7], 4, epsilon); - EXPECT_NEAR(this->blob_top_->cpu_data()[8], 1, epsilon); -} - -/* -TYPED_TEST(CuDNNPoolingLayerTest, TestGradientMaxTopMaskCuDNN) { - for (int kernel_h = 3; kernel_h <= 4; kernel_h++) { - for (int kernel_w = 3; kernel_w <= 4; kernel_w++) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(kernel_h); - pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); - this->blob_top_vec_.push_back(this->blob_top_mask_); - CuDNNPoolingLayer layer(layer_param); - GradientChecker checker(1e-4, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - this->blob_top_vec_.pop_back(); - } - } -} -*/ - -TYPED_TEST(CuDNNPoolingLayerTest, TestForwardAveCuDNN) { - 
LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(1); - // Currently, cuDNN pooling does not support padding, so we use - // a simplified version of this test. - pooling_param->set_pad(0); - pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - this->blob_bottom_->Reshape(1, 1, 3, 3); - FillerParameter filler_param; - filler_param.set_value(TypeParam(2)); - ConstantFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - CuDNNPoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 1); - EXPECT_EQ(this->blob_top_->channels(), 1); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - TypeParam epsilon = 1e-5; - EXPECT_NEAR(this->blob_top_->cpu_data()[0], 2.0, epsilon); -} - -TYPED_TEST(CuDNNPoolingLayerTest, TestGradientAveCuDNN) { - for (int kernel_h = 3; kernel_h <= 4; kernel_h++) { - for (int kernel_w = 3; kernel_w <= 4; kernel_w++) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(kernel_h); - pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - CuDNNPoolingLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - } -} - -TYPED_TEST(CuDNNPoolingLayerTest, TestGradientAvePaddedCuDNN) { - for (int kernel_h = 3; kernel_h <= 4; kernel_h++) { - for (int kernel_w = 3; kernel_w <= 4; kernel_w++) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_h(kernel_h); - pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pad(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - CuDNNPoolingLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - } -} - -#endif - -} // namespace caffe diff --git a/src/caffe/test/test_power_layer.cpp b/src/caffe/test/test_power_layer.cpp deleted file mode 100644 index 1aa587a..0000000 --- a/src/caffe/test/test_power_layer.cpp +++ /dev/null @@ -1,170 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/power_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class PowerLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - PowerLayerTest() - : blob_bottom_(new Blob(2, 3, 4, 5)), - blob_top_(new Blob()) { - Caffe::set_random_seed(1701); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~PowerLayerTest() { delete blob_bottom_; delete blob_top_; } - - void TestForward(Dtype power, Dtype scale, Dtype shift) { - LayerParameter layer_param; - layer_param.mutable_power_param()->set_power(power); - layer_param.mutable_power_param()->set_scale(scale); - 
layer_param.mutable_power_param()->set_shift(shift); - PowerLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - const Dtype min_precision = 1e-5; - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - Dtype expected_value = pow(shift + scale * bottom_data[i], power); - if (power == Dtype(0) || power == Dtype(1) || power == Dtype(2)) { - EXPECT_FALSE(isnan(top_data[i])); - } - if (isnan(expected_value)) { - EXPECT_TRUE(isnan(top_data[i])); - } else { - Dtype precision = std::max( - Dtype(std::abs(expected_value * Dtype(1e-4))), min_precision); - EXPECT_NEAR(expected_value, top_data[i], precision); - } - } - } - - void TestBackward(Dtype power, Dtype scale, Dtype shift) { - LayerParameter layer_param; - layer_param.mutable_power_param()->set_power(power); - layer_param.mutable_power_param()->set_scale(scale); - layer_param.mutable_power_param()->set_shift(shift); - PowerLayer layer(layer_param); - if (power != Dtype(0) && power != Dtype(1) && power != Dtype(2)) { - // Avoid NaNs by forcing (shift + scale * x) >= 0 - Dtype* bottom_data = this->blob_bottom_->mutable_cpu_data(); - Dtype min_value = -shift / scale; - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - if (bottom_data[i] < min_value) { - bottom_data[i] = min_value + (min_value - bottom_data[i]); - } - } - } - GradientChecker checker(1e-3, 1e-2, 1701, 0., 0.01); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(PowerLayerTest, TestDtypesAndDevices); - -TYPED_TEST(PowerLayerTest, TestPower) { - typedef typename TypeParam::Dtype Dtype; - Dtype power = 0.37; - Dtype scale = 0.83; - Dtype shift = -2.4; - this->TestForward(power, scale, shift); -} - -TYPED_TEST(PowerLayerTest, TestPowerGradient) { - typedef typename TypeParam::Dtype Dtype; - Dtype power = 0.37; - Dtype scale = 0.83; - Dtype shift = -2.4; - this->TestBackward(power, scale, shift); -} - -TYPED_TEST(PowerLayerTest, TestPowerGradientShiftZero) { - typedef typename TypeParam::Dtype Dtype; - Dtype power = 0.37; - Dtype scale = 0.83; - Dtype shift = 0.0; - this->TestBackward(power, scale, shift); -} - -TYPED_TEST(PowerLayerTest, TestPowerZero) { - typedef typename TypeParam::Dtype Dtype; - Dtype power = 0.0; - Dtype scale = 0.83; - Dtype shift = -2.4; - this->TestForward(power, scale, shift); -} - -TYPED_TEST(PowerLayerTest, TestPowerZeroGradient) { - typedef typename TypeParam::Dtype Dtype; - Dtype power = 0.0; - Dtype scale = 0.83; - Dtype shift = -2.4; - this->TestBackward(power, scale, shift); -} - -TYPED_TEST(PowerLayerTest, TestPowerOne) { - typedef typename TypeParam::Dtype Dtype; - Dtype power = 1.0; - Dtype scale = 0.83; - Dtype shift = -2.4; - this->TestForward(power, scale, shift); -} - -TYPED_TEST(PowerLayerTest, TestPowerOneGradient) { - typedef typename TypeParam::Dtype Dtype; - Dtype power = 1.0; - Dtype scale = 0.83; - Dtype shift = -2.4; - this->TestBackward(power, scale, shift); -} - -TYPED_TEST(PowerLayerTest, TestPowerTwo) { - typedef typename TypeParam::Dtype Dtype; - Dtype power = 2.0; - Dtype scale = 0.34; - Dtype shift = -2.4; - this->TestForward(power, scale, shift); -} - -TYPED_TEST(PowerLayerTest, TestPowerTwoGradient) { - typedef 
typename TypeParam::Dtype Dtype; - Dtype power = 2.0; - Dtype scale = 0.83; - Dtype shift = -2.4; - this->TestBackward(power, scale, shift); -} - -TYPED_TEST(PowerLayerTest, TestPowerTwoScaleHalfGradient) { - typedef typename TypeParam::Dtype Dtype; - Dtype power = 2.0; - Dtype scale = 0.5; - Dtype shift = -2.4; - this->TestBackward(power, scale, shift); -} - -} // namespace caffe diff --git a/src/caffe/test/test_protobuf.cpp b/src/caffe/test/test_protobuf.cpp deleted file mode 100644 index 01de461..0000000 --- a/src/caffe/test/test_protobuf.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// This is simply a script that tries serializing protocol buffer in text -// format. Nothing special here and no actual code is being tested. -#include - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" - -#include "caffe/proto/caffe.pb.h" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -class ProtoTest : public ::testing::Test {}; - -TEST_F(ProtoTest, TestSerialization) { - LayerParameter param; - param.set_name("test"); - param.set_type("Test"); - std::cout << "Printing in binary format." << std::endl; - std::cout << param.SerializeAsString() << std::endl; - std::cout << "Printing in text format." << std::endl; - std::string str; - google::protobuf::TextFormat::PrintToString(param, &str); - std::cout << str << std::endl; - EXPECT_TRUE(true); -} - -} // namespace caffe diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp deleted file mode 100644 index 833b004..0000000 --- a/src/caffe/test/test_random_number_generator.cpp +++ /dev/null @@ -1,520 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/syncedmem.hpp" -#include "caffe/util/math_functions.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class RandomNumberGeneratorTest : public ::testing::Test { - protected: - RandomNumberGeneratorTest() - : mean_bound_multiplier_(3.8), // ~99.99% confidence for test failure. 
- sample_size_(10000), - seed_(1701), - data_(new SyncedMemory(sample_size_ * sizeof(Dtype))), - data_2_(new SyncedMemory(sample_size_ * sizeof(Dtype))), - int_data_(new SyncedMemory(sample_size_ * sizeof(int))), - int_data_2_(new SyncedMemory(sample_size_ * sizeof(int))) {} - - virtual void SetUp() { - Caffe::set_random_seed(this->seed_); - } - - Dtype sample_mean(const Dtype* const seqs, const int sample_size) { - Dtype sum = 0; - for (int i = 0; i < sample_size; ++i) { - sum += seqs[i]; - } - return sum / sample_size; - } - - Dtype sample_mean(const Dtype* const seqs) { - return sample_mean(seqs, sample_size_); - } - - Dtype sample_mean(const int* const seqs, const int sample_size) { - Dtype sum = 0; - for (int i = 0; i < sample_size; ++i) { - sum += Dtype(seqs[i]); - } - return sum / sample_size; - } - - Dtype sample_mean(const int* const seqs) { - return sample_mean(seqs, sample_size_); - } - - Dtype mean_bound(const Dtype std, const int sample_size) { - return mean_bound_multiplier_ * std / sqrt(static_cast(sample_size)); - } - - Dtype mean_bound(const Dtype std) { - return mean_bound(std, sample_size_); - } - - void RngGaussianFill(const Dtype mu, const Dtype sigma, void* cpu_data) { - Dtype* rng_data = static_cast(cpu_data); - caffe_rng_gaussian(sample_size_, mu, sigma, rng_data); - } - - void RngGaussianChecks(const Dtype mu, const Dtype sigma, - const void* cpu_data, const Dtype sparse_p = 0) { - const Dtype* rng_data = static_cast(cpu_data); - const Dtype true_mean = mu; - const Dtype true_std = sigma; - // Check that sample mean roughly matches true mean. - const Dtype bound = this->mean_bound(true_std); - const Dtype sample_mean = this->sample_mean( - static_cast(cpu_data)); - EXPECT_NEAR(sample_mean, true_mean, bound); - // Check that roughly half the samples are above the true mean. - int num_above_mean = 0; - int num_below_mean = 0; - int num_mean = 0; - int num_nan = 0; - for (int i = 0; i < sample_size_; ++i) { - if (rng_data[i] > true_mean) { - ++num_above_mean; - } else if (rng_data[i] < true_mean) { - ++num_below_mean; - } else if (rng_data[i] == true_mean) { - ++num_mean; - } else { - ++num_nan; - } - } - EXPECT_EQ(0, num_nan); - if (sparse_p == Dtype(0)) { - EXPECT_EQ(0, num_mean); - } - const Dtype sample_p_above_mean = - static_cast(num_above_mean) / sample_size_; - const Dtype bernoulli_p = (1 - sparse_p) * 0.5; - const Dtype bernoulli_std = sqrt(bernoulli_p * (1 - bernoulli_p)); - const Dtype bernoulli_bound = this->mean_bound(bernoulli_std); - EXPECT_NEAR(bernoulli_p, sample_p_above_mean, bernoulli_bound); - } - - void RngUniformFill(const Dtype lower, const Dtype upper, void* cpu_data) { - CHECK_GE(upper, lower); - Dtype* rng_data = static_cast(cpu_data); - caffe_rng_uniform(sample_size_, lower, upper, rng_data); - } - - void RngUniformChecks(const Dtype lower, const Dtype upper, - const void* cpu_data, const Dtype sparse_p = 0) { - const Dtype* rng_data = static_cast(cpu_data); - const Dtype true_mean = (lower + upper) / 2; - const Dtype true_std = (upper - lower) / sqrt(12); - // Check that sample mean roughly matches true mean. - const Dtype bound = this->mean_bound(true_std); - const Dtype sample_mean = this->sample_mean(rng_data); - EXPECT_NEAR(sample_mean, true_mean, bound); - // Check that roughly half the samples are above the true mean, and none are - // above upper or below lower. 
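The tolerance behind these RNG checks is the mean_bound() helper defined above: a 3.8-sigma band around the true mean of sample_size_ = 10000 draws, which is where the "~99.99% confidence" remark comes from. A small standalone C++ sketch of that arithmetic, independent of the Caffe types:

#include <cmath>
#include <cstdio>

// Same formula as RandomNumberGeneratorTest::mean_bound():
// multiplier * std / sqrt(sample_size).
double mean_bound(double std_dev, int sample_size, double multiplier = 3.8) {
  return multiplier * std_dev / std::sqrt(static_cast<double>(sample_size));
}

int main() {
  // Uniform on [0, 1]: true std is 1/sqrt(12) ~= 0.289, so the sample mean of
  // 10000 draws must land within ~0.011 of 0.5 for EXPECT_NEAR to pass.
  std::printf("bound = %f\n", mean_bound(1.0 / std::sqrt(12.0), 10000));
  return 0;
}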
- int num_above_mean = 0; - int num_below_mean = 0; - int num_mean = 0; - int num_nan = 0; - int num_above_upper = 0; - int num_below_lower = 0; - for (int i = 0; i < sample_size_; ++i) { - if (rng_data[i] > true_mean) { - ++num_above_mean; - } else if (rng_data[i] < true_mean) { - ++num_below_mean; - } else if (rng_data[i] == true_mean) { - ++num_mean; - } else { - ++num_nan; - } - if (rng_data[i] > upper) { - ++num_above_upper; - } else if (rng_data[i] < lower) { - ++num_below_lower; - } - } - EXPECT_EQ(0, num_nan); - EXPECT_EQ(0, num_above_upper); - EXPECT_EQ(0, num_below_lower); - if (sparse_p == Dtype(0)) { - EXPECT_EQ(0, num_mean); - } - const Dtype sample_p_above_mean = - static_cast(num_above_mean) / sample_size_; - const Dtype bernoulli_p = (1 - sparse_p) * 0.5; - const Dtype bernoulli_std = sqrt(bernoulli_p * (1 - bernoulli_p)); - const Dtype bernoulli_bound = this->mean_bound(bernoulli_std); - EXPECT_NEAR(bernoulli_p, sample_p_above_mean, bernoulli_bound); - } - - void RngBernoulliFill(const Dtype p, void* cpu_data) { - int* rng_data = static_cast(cpu_data); - caffe_rng_bernoulli(sample_size_, p, rng_data); - } - - void RngBernoulliChecks(const Dtype p, const void* cpu_data) { - const int* rng_data = static_cast(cpu_data); - const Dtype true_mean = p; - const Dtype true_std = sqrt(p * (1 - p)); - const Dtype bound = this->mean_bound(true_std); - const Dtype sample_mean = this->sample_mean(rng_data); - EXPECT_NEAR(sample_mean, true_mean, bound); - } - -#ifndef CPU_ONLY - - void RngGaussianFillGPU(const Dtype mu, const Dtype sigma, void* gpu_data) { - Dtype* rng_data = static_cast(gpu_data); - caffe_gpu_rng_gaussian(sample_size_, mu, sigma, rng_data); - } - - void RngUniformFillGPU(const Dtype lower, const Dtype upper, void* gpu_data) { - CHECK_GE(upper, lower); - Dtype* rng_data = static_cast(gpu_data); - caffe_gpu_rng_uniform(sample_size_, lower, upper, rng_data); - } - - // Fills with uniform integers in [0, UINT_MAX] using 2 argument form of - // caffe_gpu_rng_uniform. 
- void RngUniformIntFillGPU(void* gpu_data) { - unsigned int* rng_data = static_cast(gpu_data); - caffe_gpu_rng_uniform(sample_size_, rng_data); - } - -#endif - - int num_above_mean; - int num_below_mean; - - Dtype mean_bound_multiplier_; - - size_t sample_size_; - uint32_t seed_; - - shared_ptr data_; - shared_ptr data_2_; - shared_ptr int_data_; - shared_ptr int_data_2_; -}; - -TYPED_TEST_CASE(RandomNumberGeneratorTest, TestDtypes); - -TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) { - const TypeParam mu = 0; - const TypeParam sigma = 1; - void* gaussian_data = this->data_->mutable_cpu_data(); - this->RngGaussianFill(mu, sigma, gaussian_data); - this->RngGaussianChecks(mu, sigma, gaussian_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian2) { - const TypeParam mu = -2; - const TypeParam sigma = 3; - void* gaussian_data = this->data_->mutable_cpu_data(); - this->RngGaussianFill(mu, sigma, gaussian_data); - this->RngGaussianChecks(mu, sigma, gaussian_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform) { - const TypeParam lower = 0; - const TypeParam upper = 1; - void* uniform_data = this->data_->mutable_cpu_data(); - this->RngUniformFill(lower, upper, uniform_data); - this->RngUniformChecks(lower, upper, uniform_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform2) { - const TypeParam lower = -7.3; - const TypeParam upper = -2.3; - void* uniform_data = this->data_->mutable_cpu_data(); - this->RngUniformFill(lower, upper, uniform_data); - this->RngUniformChecks(lower, upper, uniform_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngBernoulli) { - const TypeParam p = 0.3; - void* bernoulli_data = this->int_data_->mutable_cpu_data(); - this->RngBernoulliFill(p, bernoulli_data); - this->RngBernoulliChecks(p, bernoulli_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngBernoulli2) { - const TypeParam p = 0.9; - void* bernoulli_data = this->int_data_->mutable_cpu_data(); - this->RngBernoulliFill(p, bernoulli_data); - this->RngBernoulliChecks(p, bernoulli_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussianTimesGaussian) { - const TypeParam mu = 0; - const TypeParam sigma = 1; - - // Sample from 0 mean Gaussian. - TypeParam* gaussian_data_1 = - static_cast(this->data_->mutable_cpu_data()); - this->RngGaussianFill(mu, sigma, gaussian_data_1); - - // Sample from 0 mean Gaussian again. - TypeParam* gaussian_data_2 = - static_cast(this->data_2_->mutable_cpu_data()); - this->RngGaussianFill(mu, sigma, gaussian_data_2); - - // Multiply Gaussians. - for (int i = 0; i < this->sample_size_; ++i) { - gaussian_data_1[i] *= gaussian_data_2[i]; - } - - // Check that result has mean 0. - TypeParam mu_product = pow(mu, 2); - TypeParam sigma_product = sqrt(pow(sigma, 2) / 2); - this->RngGaussianChecks(mu_product, sigma_product, gaussian_data_1); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngUniformTimesUniform) { - // Sample from Uniform on [-2, 2]. - const TypeParam lower_1 = -2; - const TypeParam upper_1 = -lower_1; - TypeParam* uniform_data_1 = - static_cast(this->data_->mutable_cpu_data()); - this->RngUniformFill(lower_1, upper_1, uniform_data_1); - - // Sample from Uniform on [-3, 3]. - const TypeParam lower_2 = -3; - const TypeParam upper_2 = -lower_2; - TypeParam* uniform_data_2 = - static_cast(this->data_2_->mutable_cpu_data()); - this->RngUniformFill(lower_2, upper_2, uniform_data_2); - - // Multiply Uniforms. 
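A reasoning step the surrounding comments leave implicit: the elementwise product of independent U[-2, 2] and U[-3, 3] draws is symmetric about 0 and confined to [-6, 6], so the U[-6, 6] checks applied below cannot be violated even though the product is not itself uniform. Its spread is in fact smaller than assumed, which only makes the mean check more forgiving:

// range:     [-2, 2] * [-3, 3] lies inside [-6, 6]
// true mean: 0                              (both factors are zero-mean)
// true std:  (4/sqrt(12)) * (6/sqrt(12)) = 2, versus the 12/sqrt(12) ~= 3.46
//            that RngUniformChecks assumes for a genuine U[-6, 6]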
- for (int i = 0; i < this->sample_size_; ++i) { - uniform_data_1[i] *= uniform_data_2[i]; - } - - // Check that result does not violate checked properties of Uniform on [-6, 6] - // (though it is not actually uniformly distributed). - const TypeParam lower_prod = lower_1 * upper_2; - const TypeParam upper_prod = -lower_prod; - this->RngUniformChecks(lower_prod, upper_prod, uniform_data_1); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussianTimesBernoulli) { - // Sample from 0 mean Gaussian. - const TypeParam mu = 0; - const TypeParam sigma = 1; - TypeParam* gaussian_data = - static_cast(this->data_->mutable_cpu_data()); - this->RngGaussianFill(mu, sigma, gaussian_data); - - // Sample from Bernoulli with p = 0.3. - const TypeParam bernoulli_p = 0.3; - int* bernoulli_data = - static_cast(this->int_data_->mutable_cpu_data()); - this->RngBernoulliFill(bernoulli_p, bernoulli_data); - - // Multiply Gaussian by Bernoulli. - for (int i = 0; i < this->sample_size_; ++i) { - gaussian_data[i] *= bernoulli_data[i]; - } - - // Check that result does not violate checked properties of sparsified - // Gaussian (though it is not actually a Gaussian). - this->RngGaussianChecks(mu, sigma, gaussian_data, 1 - bernoulli_p); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngUniformTimesBernoulli) { - // Sample from Uniform on [-1, 1]. - const TypeParam lower = -1; - const TypeParam upper = 1; - TypeParam* uniform_data = - static_cast(this->data_->mutable_cpu_data()); - this->RngUniformFill(lower, upper, uniform_data); - - // Sample from Bernoulli with p = 0.3. - const TypeParam bernoulli_p = 0.3; - int* bernoulli_data = - static_cast(this->int_data_->mutable_cpu_data()); - this->RngBernoulliFill(bernoulli_p, bernoulli_data); - - // Multiply Uniform by Bernoulli. - for (int i = 0; i < this->sample_size_; ++i) { - uniform_data[i] *= bernoulli_data[i]; - } - - // Check that result does not violate checked properties of sparsified - // Uniform on [-1, 1] (though it is not actually uniformly distributed). - this->RngUniformChecks(lower, upper, uniform_data, 1 - bernoulli_p); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngBernoulliTimesBernoulli) { - // Sample from Bernoulli with p = 0.5. - const TypeParam p_a = 0.5; - int* bernoulli_data_a = - static_cast(this->int_data_->mutable_cpu_data()); - this->RngBernoulliFill(p_a, bernoulli_data_a); - - // Sample from Bernoulli with p = 0.3. - const TypeParam p_b = 0.3; - int* bernoulli_data_b = - static_cast(this->int_data_2_->mutable_cpu_data()); - this->RngBernoulliFill(p_b, bernoulli_data_b); - - // Multiply Bernoullis. - for (int i = 0; i < this->sample_size_; ++i) { - bernoulli_data_a[i] *= bernoulli_data_b[i]; - } - int num_ones = 0; - for (int i = 0; i < this->sample_size_; ++i) { - if (bernoulli_data_a[i] != TypeParam(0)) { - EXPECT_EQ(TypeParam(1), bernoulli_data_a[i]); - ++num_ones; - } - } - - // Check that resulting product has roughly p_a * p_b ones. 
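One more implicit step: the product of independent Bernoulli(p_a) and Bernoulli(p_b) draws equals 1 only when both draws are 1, so it is itself Bernoulli(p_a * p_b). With the values used here:

// true_mean = 0.5 * 0.3                    = 0.15
// true_std  = sqrt(0.15 * 0.85)            ~= 0.357
// bound     = 3.8 * true_std / sqrt(10000) ~= 0.0136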
- const TypeParam sample_p = this->sample_mean(bernoulli_data_a); - const TypeParam true_mean = p_a * p_b; - const TypeParam true_std = sqrt(true_mean * (1 - true_mean)); - const TypeParam bound = this->mean_bound(true_std); - EXPECT_NEAR(true_mean, sample_p, bound); -} - -#ifndef CPU_ONLY - -TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussianGPU) { - const TypeParam mu = 0; - const TypeParam sigma = 1; - void* gaussian_gpu_data = this->data_->mutable_gpu_data(); - this->RngGaussianFillGPU(mu, sigma, gaussian_gpu_data); - const void* gaussian_data = this->data_->cpu_data(); - this->RngGaussianChecks(mu, sigma, gaussian_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian2GPU) { - const TypeParam mu = -2; - const TypeParam sigma = 3; - void* gaussian_gpu_data = this->data_->mutable_gpu_data(); - this->RngGaussianFillGPU(mu, sigma, gaussian_gpu_data); - const void* gaussian_data = this->data_->cpu_data(); - this->RngGaussianChecks(mu, sigma, gaussian_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngUniformGPU) { - const TypeParam lower = 0; - const TypeParam upper = 1; - void* uniform_gpu_data = this->data_->mutable_gpu_data(); - this->RngUniformFillGPU(lower, upper, uniform_gpu_data); - const void* uniform_data = this->data_->cpu_data(); - this->RngUniformChecks(lower, upper, uniform_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform2GPU) { - const TypeParam lower = -7.3; - const TypeParam upper = -2.3; - void* uniform_gpu_data = this->data_->mutable_gpu_data(); - this->RngUniformFillGPU(lower, upper, uniform_gpu_data); - const void* uniform_data = this->data_->cpu_data(); - this->RngUniformChecks(lower, upper, uniform_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngUniformIntGPU) { - unsigned int* uniform_uint_gpu_data = - static_cast(this->int_data_->mutable_gpu_data()); - this->RngUniformIntFillGPU(uniform_uint_gpu_data); - const unsigned int* uniform_uint_data = - static_cast(this->int_data_->cpu_data()); - TypeParam* uniform_data = - static_cast(this->data_->mutable_cpu_data()); - for (int i = 0; i < this->sample_size_; ++i) { - uniform_data[i] = static_cast(uniform_uint_data[i]); - } - const TypeParam lower = 0; - const TypeParam upper = UINT_MAX; - this->RngUniformChecks(lower, upper, uniform_data); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussianTimesGaussianGPU) { - const TypeParam mu = 0; - const TypeParam sigma = 1; - - // Sample from 0 mean Gaussian. - TypeParam* gaussian_gpu_data_1 = - static_cast(this->data_->mutable_gpu_data()); - this->RngGaussianFillGPU(mu, sigma, gaussian_gpu_data_1); - - // Sample from 0 mean Gaussian again. - TypeParam* gaussian_gpu_data_2 = - static_cast(this->data_2_->mutable_gpu_data()); - this->RngGaussianFillGPU(mu, sigma, gaussian_gpu_data_2); - - // Multiply Gaussians. - TypeParam* gaussian_data_1 = - static_cast(this->data_->mutable_cpu_data()); - const TypeParam* gaussian_data_2 = - static_cast(this->data_2_->cpu_data()); - for (int i = 0; i < this->sample_size_; ++i) { - gaussian_data_1[i] *= gaussian_data_2[i]; - } - - // Check that result does not violate checked properties of Gaussian - // (though it is not actually a Gaussian). - TypeParam mu_product = pow(mu, 2); - TypeParam sigma_product = sqrt(pow(sigma, 2) / 2); - this->RngGaussianChecks(mu_product, sigma_product, gaussian_data_1); -} - - -TYPED_TEST(RandomNumberGeneratorTest, TestRngUniformTimesUniformGPU) { - // Sample from Uniform on [-2, 2]. 
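// Hedged aside (not part of the deleted file): these GPU variants follow the
// same pattern as the CPU tests above -- fill through mutable_gpu_data(), then
// validate on the host. Reading cpu_data() afterwards is what triggers the
// device-to-host synchronization in SyncedMemory, so the checks see the
// freshly generated values.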
- const TypeParam lower_1 = -2; - const TypeParam upper_1 = -lower_1; - TypeParam* uniform_gpu_data_1 = - static_cast(this->data_->mutable_gpu_data()); - this->RngUniformFillGPU(lower_1, upper_1, uniform_gpu_data_1); - - // Sample from Uniform on [-3, 3]. - const TypeParam lower_2 = -3; - const TypeParam upper_2 = -lower_2; - TypeParam* uniform_gpu_data_2 = - static_cast(this->data_2_->mutable_gpu_data()); - this->RngUniformFillGPU(lower_2, upper_2, uniform_gpu_data_2); - - // Multiply Uniforms. - TypeParam* uniform_data_1 = - static_cast(this->data_->mutable_cpu_data()); - const TypeParam* uniform_data_2 = - static_cast(this->data_2_->cpu_data()); - for (int i = 0; i < this->sample_size_; ++i) { - uniform_data_1[i] *= uniform_data_2[i]; - } - - // Check that result does not violate properties of Uniform on [-7, -3]. - const TypeParam lower_prod = lower_1 * upper_2; - const TypeParam upper_prod = -lower_prod; - this->RngUniformChecks(lower_prod, upper_prod, uniform_data_1); -} - -#endif - -} // namespace caffe diff --git a/src/caffe/test/test_reduction_layer.cpp b/src/caffe/test/test_reduction_layer.cpp deleted file mode 100644 index 6ed7cda..0000000 --- a/src/caffe/test/test_reduction_layer.cpp +++ /dev/null @@ -1,296 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/reduction_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class ReductionLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - ReductionLayerTest() - : blob_bottom_(new Blob(2, 3, 4, 5)), - blob_top_(new Blob()) { - // fill the values - Caffe::set_random_seed(1701); - FillerParameter filler_param; - UniformFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~ReductionLayerTest() { - delete blob_bottom_; - delete blob_top_; - } - - void TestForward(ReductionParameter_ReductionOp op, - float coeff = 1, int axis = 0) { - LayerParameter layer_param; - ReductionParameter* reduction_param = layer_param.mutable_reduction_param(); - reduction_param->set_operation(op); - if (coeff != 1.0) { reduction_param->set_coeff(coeff); } - if (axis != 0) { reduction_param->set_axis(axis); } - shared_ptr > layer( - new ReductionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* in_data = this->blob_bottom_->cpu_data(); - const int num = this->blob_bottom_->count(0, axis); - const int dim = this->blob_bottom_->count(axis); - for (int n = 0; n < num; ++n) { - Dtype expected_result = 0; - for (int d = 0; d < dim; ++d) { - switch (op) { - case ReductionParameter_ReductionOp_SUM: - expected_result += *in_data; - break; - case ReductionParameter_ReductionOp_MEAN: - expected_result += *in_data / dim; - break; - case ReductionParameter_ReductionOp_ASUM: - expected_result += fabs(*in_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - expected_result += (*in_data) * (*in_data); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op); - } - ++in_data; - } - expected_result *= coeff; - const Dtype computed_result = this->blob_top_->cpu_data()[n]; - EXPECT_FLOAT_EQ(expected_result, computed_result) - << "Incorrect result computed with op " - << 
ReductionParameter_ReductionOp_Name(op) << ", coeff " << coeff; - } - } - - void TestGradient(ReductionParameter_ReductionOp op, - float coeff = 1, int axis = 0) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ReductionParameter* reduction_param = layer_param.mutable_reduction_param(); - reduction_param->set_operation(op); - reduction_param->set_coeff(coeff); - reduction_param->set_axis(axis); - ReductionLayer layer(layer_param); - GradientChecker checker(1e-2, 2e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(ReductionLayerTest, TestDtypesAndDevices); - -TYPED_TEST(ReductionLayerTest, TestSetUp) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - shared_ptr > layer( - new ReductionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 0); -} - -TYPED_TEST(ReductionLayerTest, TestSetUpWithAxis1) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_reduction_param()->set_axis(1); - shared_ptr > layer( - new ReductionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 1); - EXPECT_EQ(this->blob_top_->shape(0), 2); -} - -TYPED_TEST(ReductionLayerTest, TestSetUpWithAxis2) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_reduction_param()->set_axis(2); - shared_ptr > layer( - new ReductionLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 2); - EXPECT_EQ(this->blob_top_->shape(0), 2); - EXPECT_EQ(this->blob_top_->shape(1), 3); -} - -TYPED_TEST(ReductionLayerTest, TestSum) { - const ReductionParameter_ReductionOp kOp = ReductionParameter_ReductionOp_SUM; - this->TestForward(kOp); -} - -TYPED_TEST(ReductionLayerTest, TestSumCoeff) { - const ReductionParameter_ReductionOp kOp = ReductionParameter_ReductionOp_SUM; - const float kCoeff = 2.3; - this->TestForward(kOp, kCoeff); -} - -TYPED_TEST(ReductionLayerTest, TestSumCoeffAxis1) { - const ReductionParameter_ReductionOp kOp = ReductionParameter_ReductionOp_SUM; - const float kCoeff = 2.3; - const int kAxis = 1; - this->TestForward(kOp, kCoeff, kAxis); -} - -TYPED_TEST(ReductionLayerTest, TestSumGradient) { - const ReductionParameter_ReductionOp kOp = ReductionParameter_ReductionOp_SUM; - this->TestGradient(kOp); -} - -TYPED_TEST(ReductionLayerTest, TestSumCoeffGradient) { - const ReductionParameter_ReductionOp kOp = ReductionParameter_ReductionOp_SUM; - const float kCoeff = 2.3; - this->TestGradient(kOp, kCoeff); -} - -TYPED_TEST(ReductionLayerTest, TestSumCoeffAxis1Gradient) { - const ReductionParameter_ReductionOp kOp = ReductionParameter_ReductionOp_SUM; - const float kCoeff = 2.3; - const int kAxis = 1; - this->TestGradient(kOp, kCoeff, kAxis); -} - -TYPED_TEST(ReductionLayerTest, TestMean) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_MEAN; - this->TestForward(kOp); -} - -TYPED_TEST(ReductionLayerTest, TestMeanCoeff) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_MEAN; - const float kCoeff = 2.3; - this->TestForward(kOp, kCoeff); -} - -TYPED_TEST(ReductionLayerTest, TestMeanCoeffAxis1) { - const ReductionParameter_ReductionOp kOp = - 
ReductionParameter_ReductionOp_MEAN; - const float kCoeff = 2.3; - const int kAxis = 1; - this->TestForward(kOp, kCoeff, kAxis); -} - -TYPED_TEST(ReductionLayerTest, TestMeanGradient) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_MEAN; - this->TestGradient(kOp); -} - -TYPED_TEST(ReductionLayerTest, TestMeanCoeffGradient) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_MEAN; - const float kCoeff = 2.3; - this->TestGradient(kOp, kCoeff); -} - -TYPED_TEST(ReductionLayerTest, TestMeanCoeffGradientAxis1) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_MEAN; - const float kCoeff = 2.3; - const int kAxis = 1; - this->TestGradient(kOp, kCoeff, kAxis); -} - -TYPED_TEST(ReductionLayerTest, TestAbsSum) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_ASUM; - this->TestForward(kOp); -} - -TYPED_TEST(ReductionLayerTest, TestAbsSumCoeff) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_ASUM; - const float kCoeff = 2.3; - this->TestForward(kOp, kCoeff); -} - -TYPED_TEST(ReductionLayerTest, TestAbsSumCoeffAxis1) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_ASUM; - const float kCoeff = 2.3; - const int kAxis = 1; - this->TestForward(kOp, kCoeff, kAxis); -} - -TYPED_TEST(ReductionLayerTest, TestAbsSumGradient) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_ASUM; - this->TestGradient(kOp); -} - -TYPED_TEST(ReductionLayerTest, TestAbsSumCoeffGradient) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_ASUM; - const float kCoeff = 2.3; - this->TestGradient(kOp, kCoeff); -} - -TYPED_TEST(ReductionLayerTest, TestAbsSumCoeffAxis1Gradient) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_ASUM; - const float kCoeff = 2.3; - const int kAxis = 1; - this->TestGradient(kOp, kCoeff, kAxis); -} - -TYPED_TEST(ReductionLayerTest, TestSumOfSquares) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_SUMSQ; - this->TestForward(kOp); -} - -TYPED_TEST(ReductionLayerTest, TestSumOfSquaresCoeff) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_SUMSQ; - const float kCoeff = 2.3; - this->TestForward(kOp, kCoeff); -} - -TYPED_TEST(ReductionLayerTest, TestSumOfSquaresCoeffAxis1) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_SUMSQ; - const float kCoeff = 2.3; - const int kAxis = 1; - this->TestForward(kOp, kCoeff, kAxis); -} - -TYPED_TEST(ReductionLayerTest, TestSumOfSquaresGradient) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_SUMSQ; - this->TestGradient(kOp); -} - -TYPED_TEST(ReductionLayerTest, TestSumOfSquaresCoeffGradient) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_SUMSQ; - const float kCoeff = 2.3; - this->TestGradient(kOp, kCoeff); -} - -TYPED_TEST(ReductionLayerTest, TestSumOfSquaresCoeffAxis1Gradient) { - const ReductionParameter_ReductionOp kOp = - ReductionParameter_ReductionOp_SUMSQ; - const float kCoeff = 2.3; - const int kAxis = 1; - this->TestGradient(kOp, kCoeff, kAxis); -} - -} // namespace caffe diff --git a/src/caffe/test/test_reshape_layer.cpp b/src/caffe/test/test_reshape_layer.cpp deleted file mode 100644 index 4f26138..0000000 --- a/src/caffe/test/test_reshape_layer.cpp +++ /dev/null @@ -1,279 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" 
-#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/reshape_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class ReshapeLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - protected: - ReshapeLayerTest() - : blob_bottom_(new Blob(2, 3, 6, 5)), - blob_top_(new Blob()) { - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - - virtual ~ReshapeLayerTest() { delete blob_bottom_; delete blob_top_; } - - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(ReshapeLayerTest, TestDtypesAndDevices); - -TYPED_TEST(ReshapeLayerTest, TestFlattenOutputSizes) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); - blob_shape->add_dim(0); - blob_shape->add_dim(-1); - blob_shape->add_dim(1); - blob_shape->add_dim(1); - - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 3 * 6 * 5); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); -} - -TYPED_TEST(ReshapeLayerTest, TestFlattenValues) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); - blob_shape->add_dim(0); - blob_shape->add_dim(-1); - blob_shape->add_dim(1); - blob_shape->add_dim(1); - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int c = 0; c < 3 * 6 * 5; ++c) { - EXPECT_EQ(this->blob_top_->data_at(0, c, 0, 0), - this->blob_bottom_->data_at(0, c / (6 * 5), (c / 5) % 6, c % 5)); - EXPECT_EQ(this->blob_top_->data_at(1, c, 0, 0), - this->blob_bottom_->data_at(1, c / (6 * 5), (c / 5) % 6, c % 5)); - } -} - -// Test whether setting output dimensions to 0 either explicitly or implicitly -// copies the respective dimension of the input layer. -TYPED_TEST(ReshapeLayerTest, TestCopyDimensions) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); - blob_shape->add_dim(0); - blob_shape->add_dim(0); - blob_shape->add_dim(0); - blob_shape->add_dim(0); - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 3); - EXPECT_EQ(this->blob_top_->height(), 6); - EXPECT_EQ(this->blob_top_->width(), 5); -} - -// When a dimension is set to -1, we should infer its value from the other -// dimensions (including those that get copied from below). -TYPED_TEST(ReshapeLayerTest, TestInferenceOfUnspecified) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); - blob_shape->add_dim(0); - blob_shape->add_dim(3); - blob_shape->add_dim(10); - blob_shape->add_dim(-1); - - // Count is 180, thus height should be 180 / (2*3*10) = 3. 
- - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 3); - EXPECT_EQ(this->blob_top_->height(), 10); - EXPECT_EQ(this->blob_top_->width(), 3); -} - -TYPED_TEST(ReshapeLayerTest, TestInferenceOfUnspecifiedWithStartAxis) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_reshape_param()->set_axis(1); - BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); - blob_shape->add_dim(3); - blob_shape->add_dim(10); - blob_shape->add_dim(-1); - - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - - ASSERT_EQ(this->blob_top_->num_axes(), 4); - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 3); - EXPECT_EQ(this->blob_top_->height(), 10); - EXPECT_EQ(this->blob_top_->width(), 3); -} - -TYPED_TEST(ReshapeLayerTest, TestInsertSingletonAxesStart) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_reshape_param()->set_axis(0); - layer_param.mutable_reshape_param()->set_num_axes(0); - BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); - blob_shape->add_dim(1); - blob_shape->add_dim(1); - blob_shape->add_dim(1); - - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - - ASSERT_EQ(this->blob_top_->num_axes(), 7); - EXPECT_EQ(this->blob_top_->shape(0), 1); - EXPECT_EQ(this->blob_top_->shape(1), 1); - EXPECT_EQ(this->blob_top_->shape(2), 1); - EXPECT_EQ(this->blob_top_->shape(3), 2); - EXPECT_EQ(this->blob_top_->shape(4), 3); - EXPECT_EQ(this->blob_top_->shape(5), 6); - EXPECT_EQ(this->blob_top_->shape(6), 5); -} - -TYPED_TEST(ReshapeLayerTest, TestInsertSingletonAxesMiddle) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_reshape_param()->set_axis(2); - layer_param.mutable_reshape_param()->set_num_axes(0); - BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); - blob_shape->add_dim(1); - blob_shape->add_dim(1); - blob_shape->add_dim(1); - - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - - ASSERT_EQ(this->blob_top_->num_axes(), 7); - EXPECT_EQ(this->blob_top_->shape(0), 2); - EXPECT_EQ(this->blob_top_->shape(1), 3); - EXPECT_EQ(this->blob_top_->shape(2), 1); - EXPECT_EQ(this->blob_top_->shape(3), 1); - EXPECT_EQ(this->blob_top_->shape(4), 1); - EXPECT_EQ(this->blob_top_->shape(5), 6); - EXPECT_EQ(this->blob_top_->shape(6), 5); -} - -TYPED_TEST(ReshapeLayerTest, TestInsertSingletonAxesEnd) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_reshape_param()->set_axis(-1); - layer_param.mutable_reshape_param()->set_num_axes(0); - BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); - blob_shape->add_dim(1); - blob_shape->add_dim(1); - blob_shape->add_dim(1); - - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - - ASSERT_EQ(this->blob_top_->num_axes(), 7); - EXPECT_EQ(this->blob_top_->shape(0), 2); - EXPECT_EQ(this->blob_top_->shape(1), 3); - EXPECT_EQ(this->blob_top_->shape(2), 6); - EXPECT_EQ(this->blob_top_->shape(3), 5); - EXPECT_EQ(this->blob_top_->shape(4), 1); - EXPECT_EQ(this->blob_top_->shape(5), 1); - EXPECT_EQ(this->blob_top_->shape(6), 1); -} - -TYPED_TEST(ReshapeLayerTest, TestFlattenMiddle) 
{ - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_reshape_param()->set_axis(1); - layer_param.mutable_reshape_param()->set_num_axes(2); - BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); - blob_shape->add_dim(-1); - - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - - ASSERT_EQ(this->blob_top_->num_axes(), 3); - EXPECT_EQ(this->blob_top_->shape(0), 2); - EXPECT_EQ(this->blob_top_->shape(1), 3 * 6); - EXPECT_EQ(this->blob_top_->shape(2), 5); -} - -TYPED_TEST(ReshapeLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BlobShape* shape = layer_param.mutable_reshape_param()->mutable_shape(); - shape->add_dim(6); - shape->add_dim(2); - shape->add_dim(3); - shape->add_dim(5); - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_EQ(this->blob_top_->cpu_data()[i], - this->blob_bottom_->cpu_data()[i]); - } -} - -TYPED_TEST(ReshapeLayerTest, TestForwardAfterReshape) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BlobShape* shape = layer_param.mutable_reshape_param()->mutable_shape(); - shape->add_dim(6); - shape->add_dim(2); - shape->add_dim(3); - shape->add_dim(5); - ReshapeLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // We know the above produced the correct result from TestForward. - // Reshape the bottom and call layer.Reshape, then try again. - vector new_bottom_shape(1, 2 * 3 * 6 * 5); - this->blob_bottom_->Reshape(new_bottom_shape); - layer.Reshape(this->blob_bottom_vec_, this->blob_top_vec_); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_EQ(this->blob_top_->cpu_data()[i], - this->blob_bottom_->cpu_data()[i]); - } -} - -TYPED_TEST(ReshapeLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - BlobShape* shape = layer_param.mutable_reshape_param()->mutable_shape(); - shape->add_dim(6); - shape->add_dim(2); - shape->add_dim(3); - shape->add_dim(5); - ReshapeLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_rnn_layer.cpp b/src/caffe/test/test_rnn_layer.cpp deleted file mode 100644 index dd8952d..0000000 --- a/src/caffe/test/test_rnn_layer.cpp +++ /dev/null @@ -1,217 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/rnn_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class RNNLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - RNNLayerTest() : num_output_(7) { - blob_bottom_vec_.push_back(&blob_bottom_); - blob_bottom_vec_.push_back(&blob_bottom_cont_); - blob_top_vec_.push_back(&blob_top_); - - ReshapeBlobs(1, 3); - - layer_param_.mutable_recurrent_param()->set_num_output(num_output_); - 
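// Hedged aside (not part of the deleted file): num_output = 7 is the hidden /
// output dimension of the RNN. With ReshapeBlobs(1, 3) the bottom is
// (T=1, N=3, 3, 2), so the expected top shape checked in TestSetUp is
// (1, 3, 7): the first two axes are copied from the input and the third is
// replaced by num_output_.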
FillerParameter* weight_filler = - layer_param_.mutable_recurrent_param()->mutable_weight_filler(); - weight_filler->set_type("gaussian"); - weight_filler->set_std(0.2); - FillerParameter* bias_filler = - layer_param_.mutable_recurrent_param()->mutable_bias_filler(); - bias_filler->set_type("gaussian"); - bias_filler->set_std(0.1); - - layer_param_.set_phase(TEST); - } - - void ReshapeBlobs(int num_timesteps, int num_instances) { - blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); - blob_bottom_static_.Reshape(num_instances, 2, 3, 4); - vector shape(2); - shape[0] = num_timesteps; - shape[1] = num_instances; - blob_bottom_cont_.Reshape(shape); - - FillerParameter filler_param; - filler_param.set_min(-1); - filler_param.set_max(1); - UniformFiller filler(filler_param); - filler.Fill(&blob_bottom_); - } - - int num_output_; - LayerParameter layer_param_; - Blob blob_bottom_; - Blob blob_bottom_cont_; - Blob blob_bottom_static_; - Blob blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(RNNLayerTest, TestDtypesAndDevices); - -TYPED_TEST(RNNLayerTest, TestSetUp) { - typedef typename TypeParam::Dtype Dtype; - RNNLayer layer(this->layer_param_); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - vector expected_top_shape = this->blob_bottom_.shape(); - expected_top_shape.resize(3); - expected_top_shape[2] = this->num_output_; - EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); -} - -TYPED_TEST(RNNLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - const int kNumTimesteps = 3; - const int num = this->blob_bottom_.shape(1); - this->ReshapeBlobs(kNumTimesteps, num); - - // Fill the cont blob with <0, 1, 1, ..., 1>, - // indicating a sequence that begins at the first timestep - // then continues for the rest of the sequence. - for (int t = 0; t < kNumTimesteps; ++t) { - for (int n = 0; n < num; ++n) { - this->blob_bottom_cont_.mutable_cpu_data()[t * num + n] = t > 0; - } - } - - // Process the full sequence in a single batch. - FillerParameter filler_param; - filler_param.set_mean(0); - filler_param.set_std(1); - GaussianFiller sequence_filler(filler_param); - sequence_filler.Fill(&this->blob_bottom_); - shared_ptr > layer(new RNNLayer(this->layer_param_)); - Caffe::set_random_seed(1701); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - LOG(INFO) << "Calling forward for full sequence RNN"; - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - // Copy the inputs and outputs to reuse/check them later. - Blob bottom_copy(this->blob_bottom_.shape()); - bottom_copy.CopyFrom(this->blob_bottom_); - Blob top_copy(this->blob_top_.shape()); - top_copy.CopyFrom(this->blob_top_); - - // Process the batch one timestep at a time; - // check that we get the same result. 
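// Hedged aside (not part of the deleted file): the "cont" (continuation) blob
// carries one flag per timestep and stream; 0 marks the start of a new
// sequence (reset the hidden state), 1 means "continue from the previous
// timestep". With kNumTimesteps = 3 and num = 3 streams, the full-sequence run
// above used cont = [0,0,0, 1,1,1, 1,1,1]; the single-step loop below
// reproduces that by setting cont to all zeros at t = 0 and all ones for
// t > 0, which is why its outputs must match the saved top_copy.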
- this->ReshapeBlobs(1, num); - layer.reset(new RNNLayer(this->layer_param_)); - Caffe::set_random_seed(1701); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - const int bottom_count = this->blob_bottom_.count(); - const int top_count = this->blob_top_.count(); - const Dtype kEpsilon = 1e-5; - for (int t = 0; t < kNumTimesteps; ++t) { - caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, - this->blob_bottom_.mutable_cpu_data()); - for (int n = 0; n < num; ++n) { - this->blob_bottom_cont_.mutable_cpu_data()[n] = t > 0; - } - LOG(INFO) << "Calling forward for RNN timestep " << t; - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < top_count; ++i) { - ASSERT_LT(t * top_count + i, top_copy.count()); - EXPECT_NEAR(this->blob_top_.cpu_data()[i], - top_copy.cpu_data()[t * top_count + i], kEpsilon) - << "t = " << t << "; i = " << i; - } - } - - // Process the batch one timestep at a time with all cont blobs set to 0. - // Check that we get a different result, except in the first timestep. - Caffe::set_random_seed(1701); - layer.reset(new RNNLayer(this->layer_param_)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - for (int t = 0; t < kNumTimesteps; ++t) { - caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, - this->blob_bottom_.mutable_cpu_data()); - for (int n = 0; n < num; ++n) { - this->blob_bottom_cont_.mutable_cpu_data()[n] = 0; - } - LOG(INFO) << "Calling forward for RNN timestep " << t; - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < top_count; ++i) { - if (t == 0) { - EXPECT_NEAR(this->blob_top_.cpu_data()[i], - top_copy.cpu_data()[t * top_count + i], kEpsilon) - << "t = " << t << "; i = " << i; - } else { - EXPECT_NE(this->blob_top_.cpu_data()[i], - top_copy.cpu_data()[t * top_count + i]) - << "t = " << t << "; i = " << i; - } - } - } -} - -TYPED_TEST(RNNLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - RNNLayer layer(this->layer_param_); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -TYPED_TEST(RNNLayerTest, TestGradientNonZeroCont) { - typedef typename TypeParam::Dtype Dtype; - RNNLayer layer(this->layer_param_); - GradientChecker checker(1e-2, 1e-3); - for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { - this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; - } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -TYPED_TEST(RNNLayerTest, TestGradientNonZeroContBufferSize2) { - typedef typename TypeParam::Dtype Dtype; - this->ReshapeBlobs(2, 2); - // fill the values - FillerParameter filler_param; - UniformFiller filler(filler_param); - filler.Fill(&this->blob_bottom_); - RNNLayer layer(this->layer_param_); - GradientChecker checker(1e-2, 1e-3); - for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { - this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; - } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -TYPED_TEST(RNNLayerTest, TestGradientNonZeroContBufferSize2WithStaticInput) { - typedef typename TypeParam::Dtype Dtype; - this->ReshapeBlobs(2, 2); - FillerParameter filler_param; - UniformFiller filler(filler_param); - filler.Fill(&this->blob_bottom_); - filler.Fill(&this->blob_bottom_static_); - this->blob_bottom_vec_.push_back(&this->blob_bottom_static_); - RNNLayer layer(this->layer_param_); - GradientChecker checker(1e-2, 
1e-3); - for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) { - this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2; - } - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 2); -} - -} // namespace caffe diff --git a/src/caffe/test/test_scale_layer.cpp b/src/caffe/test/test_scale_layer.cpp deleted file mode 100644 index ad11679..0000000 --- a/src/caffe/test/test_scale_layer.cpp +++ /dev/null @@ -1,507 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/scale_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class ScaleLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - ScaleLayerTest() - : blob_bottom_(new Blob(2, 3, 4, 5)), - blob_bottom_eltwise_(new Blob(2, 3, 4, 5)), - blob_bottom_broadcast_0_(new Blob()), - blob_bottom_broadcast_1_(new Blob()), - blob_bottom_broadcast_2_(new Blob()), - blob_bottom_scale_(new Blob(vector())), - blob_top_(new Blob()) { - Caffe::set_random_seed(1701); - vector broadcast_shape(2); - broadcast_shape[0] = 2; broadcast_shape[1] = 3; - this->blob_bottom_broadcast_0_->Reshape(broadcast_shape); - broadcast_shape[0] = 3; broadcast_shape[1] = 4; - this->blob_bottom_broadcast_1_->Reshape(broadcast_shape); - broadcast_shape[0] = 4; broadcast_shape[1] = 5; - this->blob_bottom_broadcast_2_->Reshape(broadcast_shape); - FillerParameter filler_param; - filler_param.set_min(1); - filler_param.set_max(10); - UniformFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - filler.Fill(this->blob_bottom_eltwise_); - filler.Fill(this->blob_bottom_broadcast_0_); - filler.Fill(this->blob_bottom_broadcast_1_); - filler.Fill(this->blob_bottom_broadcast_2_); - filler.Fill(this->blob_bottom_scale_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~ScaleLayerTest() { - delete blob_bottom_; - delete blob_bottom_eltwise_; - delete blob_bottom_broadcast_0_; - delete blob_bottom_broadcast_1_; - delete blob_bottom_broadcast_2_; - delete blob_bottom_scale_; - delete blob_top_; - } - Blob* const blob_bottom_; - Blob* const blob_bottom_eltwise_; - Blob* const blob_bottom_broadcast_0_; - Blob* const blob_bottom_broadcast_1_; - Blob* const blob_bottom_broadcast_2_; - Blob* const blob_bottom_scale_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(ScaleLayerTest, TestDtypesAndDevices); - -TYPED_TEST(ScaleLayerTest, TestForwardEltwise) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(0); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data_a = this->blob_bottom_->cpu_data(); - const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] * in_data_b[i], 1e-5); - } -} - 
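// Hedged aside (not part of the deleted file): the Scale tests above and below
// all reduce to the same reference computation -- multiply each bottom element
// by the scale element selected along a run of axes, broadcasting over the
// leading ("outer") and trailing ("inner") axes. A minimal sketch of that
// reference follows; the names outer/scale_dim/inner are illustrative, not
// Caffe's public API, and row-major blob layout is assumed.
template <typename Dtype>
void ScaleForwardReference(const Dtype* bottom, const Dtype* scale,
                           int outer, int scale_dim, int inner, Dtype* top) {
  for (int o = 0; o < outer; ++o) {
    for (int s = 0; s < scale_dim; ++s) {
      for (int i = 0; i < inner; ++i) {
        const int idx = (o * scale_dim + s) * inner + i;
        // The scale factor depends only on s, i.e. it is broadcast over the
        // outer and inner axes of the bottom blob.
        top[idx] = bottom[idx] * scale[s];
      }
    }
  }
}
// For TestForwardBroadcastMiddle (bottom 2x3x4x5, scale 3x4, axis = 1) this
// corresponds to outer = 2, scale_dim = 12, inner = 5, matching the nested
// data_at() checks in that test.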
-TYPED_TEST(ScaleLayerTest, TestForwardEltwiseInPlace) { - typedef typename TypeParam::Dtype Dtype; - this->blob_top_vec_[0] = this->blob_bottom_; // in-place computation - Blob orig_bottom(this->blob_bottom_->shape()); - orig_bottom.CopyFrom(*this->blob_bottom_); - this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(0); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_bottom_->cpu_data(); - const int count = this->blob_bottom_->count(); - const Dtype* in_data_a = orig_bottom.cpu_data(); - const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] * in_data_b[i], 1e-5); - } -} - -TYPED_TEST(ScaleLayerTest, TestBackwardEltwiseInPlace) { - typedef typename TypeParam::Dtype Dtype; - Blob orig_bottom(this->blob_bottom_->shape()); - orig_bottom.CopyFrom(*this->blob_bottom_); - this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(0); - shared_ptr > layer(new ScaleLayer(layer_param)); - Blob top_diff(this->blob_bottom_->shape()); - FillerParameter filler_param; - filler_param.set_type("gaussian"); - filler_param.set_std(1); - GaussianFiller filler(filler_param); - filler.Fill(&top_diff); - vector propagate_down(2, true); - // Run forward + backward without in-place computation; - // save resulting bottom diffs. - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - const bool kReshape = true; - const bool kCopyDiff = true; - Blob orig_bottom_diff; - orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape); - Blob orig_scale_diff; - orig_scale_diff.CopyFrom(*this->blob_bottom_eltwise_, - kCopyDiff, kReshape); - // Rerun forward + backward with in-place computation; - // check that resulting bottom diffs are the same. 
- this->blob_top_vec_[0] = this->blob_bottom_; // in-place computation - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_bottom_->mutable_cpu_diff()); - layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i], - this->blob_bottom_->cpu_diff()[i], 1e-5); - } - for (int i = 0; i < this->blob_bottom_eltwise_->count(); ++i) { - EXPECT_NEAR(orig_scale_diff.cpu_diff()[i], - this->blob_bottom_eltwise_->cpu_diff()[i], 1e-5); - } -} - -TYPED_TEST(ScaleLayerTest, TestForwardEltwiseWithParam) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ScaleParameter* scale_param = layer_param.mutable_scale_param(); - scale_param->set_axis(0); - scale_param->set_num_axes(-1); - scale_param->mutable_filler()->set_type("gaussian"); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data_a = this->blob_bottom_->cpu_data(); - const Dtype* in_data_b = layer->blobs()[0]->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data_a[i] * in_data_b[i], 1e-5); - } -} - -TYPED_TEST(ScaleLayerTest, TestForwardBroadcastBegin) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(0); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_->data_at(n, c, h, w) * - this->blob_bottom_broadcast_0_->data_at(n, c, 0, 0), - 1e-5); - } - } - } - } -} - -TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddle) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(1); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_->data_at(n, c, h, w) * - this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0), - 1e-5); - } - } - } - } -} - -TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddleInPlace) { - typedef typename TypeParam::Dtype Dtype; - this->blob_top_vec_[0] = this->blob_bottom_; // in-place computation - Blob orig_bottom(this->blob_bottom_->shape()); - 
orig_bottom.CopyFrom(*this->blob_bottom_); - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(1); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_bottom_->data_at(n, c, h, w), - orig_bottom.data_at(n, c, h, w) * - this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0), - 1e-5); - } - } - } - } -} - -TYPED_TEST(ScaleLayerTest, TestBackwardBroadcastMiddleInPlace) { - typedef typename TypeParam::Dtype Dtype; - Blob orig_bottom(this->blob_bottom_->shape()); - orig_bottom.CopyFrom(*this->blob_bottom_); - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(1); - shared_ptr > layer(new ScaleLayer(layer_param)); - Blob top_diff(this->blob_bottom_->shape()); - FillerParameter filler_param; - filler_param.set_type("gaussian"); - filler_param.set_std(1); - GaussianFiller filler(filler_param); - filler.Fill(&top_diff); - vector propagate_down(2, true); - // Run forward + backward without in-place computation; - // save resulting bottom diffs. - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_top_->mutable_cpu_diff()); - layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - const bool kReshape = true; - const bool kCopyDiff = true; - Blob orig_bottom_diff; - orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape); - Blob orig_scale_diff; - orig_scale_diff.CopyFrom(*this->blob_bottom_broadcast_1_, - kCopyDiff, kReshape); - // Rerun forward + backward with in-place computation; - // check that resulting bottom diffs are the same. 
- this->blob_top_vec_[0] = this->blob_bottom_; // in-place computation - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - caffe_copy(top_diff.count(), top_diff.cpu_data(), - this->blob_bottom_->mutable_cpu_diff()); - layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i], - this->blob_bottom_->cpu_diff()[i], 1e-5); - } - for (int i = 0; i < this->blob_bottom_broadcast_1_->count(); ++i) { - EXPECT_NEAR(orig_scale_diff.cpu_diff()[i], - this->blob_bottom_broadcast_1_->cpu_diff()[i], 1e-5); - } -} - -TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddleWithParam) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ScaleParameter* scale_param = layer_param.mutable_scale_param(); - scale_param->set_axis(1); - scale_param->set_num_axes(2); - scale_param->mutable_filler()->set_type("gaussian"); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_->data_at(n, c, h, w) * - layer->blobs()[0]->data_at(c, h, 0, 0), 1e-5); - } - } - } - } -} - -TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddleWithParamAndBias) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ScaleParameter* scale_param = layer_param.mutable_scale_param(); - scale_param->set_axis(1); - scale_param->set_num_axes(2); - scale_param->mutable_filler()->set_type("gaussian"); - scale_param->set_bias_term(true); - scale_param->mutable_bias_filler()->set_type("gaussian"); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_->data_at(n, c, h, w) * - layer->blobs()[0]->data_at(c, h, 0, 0) + - layer->blobs()[1]->data_at(c, h, 0, 0), 1e-5); - } - } - } - } -} - -TYPED_TEST(ScaleLayerTest, TestForwardBroadcastEnd) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(2); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_->data_at(n, c, h, w) * - 
this->blob_bottom_broadcast_2_->data_at(h, w, 0, 0), - 1e-5); - } - } - } - } -} - -TYPED_TEST(ScaleLayerTest, TestForwardScale) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_scale_); - LayerParameter layer_param; - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data = this->blob_bottom_->cpu_data(); - const Dtype scale = *this->blob_bottom_scale_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data[i] * scale, 1e-5); - } -} - -TYPED_TEST(ScaleLayerTest, TestForwardScaleAxis2) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_scale_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(2); - shared_ptr > layer(new ScaleLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype* data = this->blob_top_->cpu_data(); - const int count = this->blob_top_->count(); - const Dtype* in_data = this->blob_bottom_->cpu_data(); - const Dtype scale = *this->blob_bottom_scale_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_NEAR(data[i], in_data[i] * scale, 1e-5); - } -} - -TYPED_TEST(ScaleLayerTest, TestGradientEltwise) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(0); - ScaleLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ScaleLayerTest, TestGradientEltwiseWithParam) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ScaleParameter* scale_param = layer_param.mutable_scale_param(); - scale_param->set_axis(0); - scale_param->set_num_axes(-1); - scale_param->mutable_filler()->set_type("gaussian"); - ScaleLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ScaleLayerTest, TestGradientBroadcastBegin) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(0); - ScaleLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ScaleLayerTest, TestGradientBroadcastMiddle) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(1); - ScaleLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ScaleLayerTest, TestGradientBroadcastMiddleWithParam) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); - LayerParameter layer_param; - ScaleParameter* 
scale_param = layer_param.mutable_scale_param(); - scale_param->set_axis(1); - scale_param->set_num_axes(2); - scale_param->mutable_filler()->set_type("gaussian"); - ScaleLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ScaleLayerTest, TestGradientBroadcastEnd) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(2); - ScaleLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ScaleLayerTest, TestGradientScale) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_scale_); - LayerParameter layer_param; - ScaleLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ScaleLayerTest, TestGradientScaleAndBias) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_scale_); - LayerParameter layer_param; - ScaleParameter* scale_param = layer_param.mutable_scale_param(); - scale_param->set_bias_term(true); - scale_param->mutable_bias_filler()->set_type("gaussian"); - ScaleLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(ScaleLayerTest, TestGradientScaleAxis2) { - typedef typename TypeParam::Dtype Dtype; - this->blob_bottom_vec_.push_back(this->blob_bottom_scale_); - LayerParameter layer_param; - layer_param.mutable_scale_param()->set_axis(2); - ScaleLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp deleted file mode 100644 index 5dfd765..0000000 --- a/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/sigmoid_cross_entropy_loss_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class SigmoidCrossEntropyLossLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - SigmoidCrossEntropyLossLayerTest() - : blob_bottom_data_(new Blob(10, 5, 1, 1)), - blob_bottom_targets_(new Blob(10, 5, 1, 1)), - blob_top_loss_(new Blob()) { - // Fill the data vector - FillerParameter data_filler_param; - data_filler_param.set_std(1); - GaussianFiller data_filler(data_filler_param); - data_filler.Fill(blob_bottom_data_); - blob_bottom_vec_.push_back(blob_bottom_data_); - // Fill the targets vector - FillerParameter targets_filler_param; - targets_filler_param.set_min(0); - targets_filler_param.set_max(1); - UniformFiller targets_filler(targets_filler_param); - targets_filler.Fill(blob_bottom_targets_); - blob_bottom_vec_.push_back(blob_bottom_targets_); - blob_top_vec_.push_back(blob_top_loss_); - } - virtual ~SigmoidCrossEntropyLossLayerTest() { - 
delete blob_bottom_data_; - delete blob_bottom_targets_; - delete blob_top_loss_; - } - - Dtype SigmoidCrossEntropyLossReference(const int count, const int num, - const Dtype* input, - const Dtype* target) { - Dtype loss = 0; - for (int i = 0; i < count; ++i) { - const Dtype prediction = 1 / (1 + exp(-input[i])); - EXPECT_LE(prediction, 1); - EXPECT_GE(prediction, 0); - EXPECT_LE(target[i], 1); - EXPECT_GE(target[i], 0); - loss -= target[i] * log(prediction + (target[i] == Dtype(0))); - loss -= (1 - target[i]) * log(1 - prediction + (target[i] == Dtype(1))); - } - return loss / num; - } - - void TestForward() { - LayerParameter layer_param; - const Dtype kLossWeight = 3.7; - layer_param.add_loss_weight(kLossWeight); - FillerParameter data_filler_param; - data_filler_param.set_std(1); - GaussianFiller data_filler(data_filler_param); - FillerParameter targets_filler_param; - targets_filler_param.set_min(0.0); - targets_filler_param.set_max(1.0); - UniformFiller targets_filler(targets_filler_param); - Dtype eps = 2e-2; - for (int i = 0; i < 100; ++i) { - // Fill the data vector - data_filler.Fill(this->blob_bottom_data_); - // Fill the targets vector - targets_filler.Fill(this->blob_bottom_targets_); - SigmoidCrossEntropyLossLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - Dtype layer_loss = - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const int count = this->blob_bottom_data_->count(); - const int num = this->blob_bottom_data_->num(); - const Dtype* blob_bottom_data = this->blob_bottom_data_->cpu_data(); - const Dtype* blob_bottom_targets = - this->blob_bottom_targets_->cpu_data(); - Dtype reference_loss = kLossWeight * SigmoidCrossEntropyLossReference( - count, num, blob_bottom_data, blob_bottom_targets); - EXPECT_NEAR(reference_loss, layer_loss, eps) << "debug: trial #" << i; - } - } - - Blob* const blob_bottom_data_; - Blob* const blob_bottom_targets_; - Blob* const blob_top_loss_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(SigmoidCrossEntropyLossLayerTest, TestDtypesAndDevices); - -TYPED_TEST(SigmoidCrossEntropyLossLayerTest, TestSigmoidCrossEntropyLoss) { - this->TestForward(); -} - -TYPED_TEST(SigmoidCrossEntropyLossLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - const Dtype kLossWeight = 3.7; - layer_param.add_loss_weight(kLossWeight); - SigmoidCrossEntropyLossLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - GradientChecker checker(1e-2, 1e-2, 1701); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - - -} // namespace caffe diff --git a/src/caffe/test/test_slice_layer.cpp b/src/caffe/test/test_slice_layer.cpp deleted file mode 100644 index c2b231e..0000000 --- a/src/caffe/test/test_slice_layer.cpp +++ /dev/null @@ -1,215 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/slice_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class SliceLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - SliceLayerTest() - : blob_bottom_(new Blob(6, 12, 2, 3)), - blob_top_0_(new Blob()), - blob_top_1_(new Blob()), - blob_top_2_(new Blob()) {} - virtual void SetUp() { - // fill the values - Caffe::set_random_seed(1701); - 
FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_top_vec_0_.push_back(blob_top_0_); - blob_top_vec_0_.push_back(blob_top_1_); - blob_top_vec_1_.push_back(blob_top_0_); - blob_top_vec_1_.push_back(blob_top_1_); - blob_top_vec_1_.push_back(blob_top_2_); - blob_bottom_vec_.push_back(blob_bottom_); - } - - virtual void ReduceBottomBlobSize() { - blob_bottom_->Reshape(4, 5, 2, 2); - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - } - - virtual ~SliceLayerTest() { - delete blob_top_0_; delete blob_top_1_; - delete blob_top_2_; delete blob_bottom_; - } - - Blob* const blob_bottom_; - Blob* const blob_top_0_; - Blob* const blob_top_1_; - Blob* const blob_top_2_; - vector*> blob_top_vec_0_, blob_top_vec_1_; - vector*> blob_bottom_vec_; -}; - -TYPED_TEST_CASE(SliceLayerTest, TestDtypesAndDevices); - -TYPED_TEST(SliceLayerTest, TestSetupNum) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_slice_param()->set_axis(0); - SliceLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_1_); - EXPECT_EQ(this->blob_bottom_->num(), 3 * this->blob_top_0_->num()); - EXPECT_EQ(this->blob_top_0_->num(), this->blob_top_1_->num()); - EXPECT_EQ(this->blob_top_0_->num(), this->blob_top_2_->num()); - EXPECT_EQ(this->blob_bottom_->channels(), this->blob_top_0_->channels()); - EXPECT_EQ(this->blob_bottom_->height(), this->blob_top_0_->height()); - EXPECT_EQ(this->blob_bottom_->width(), this->blob_top_0_->width()); -} - -TYPED_TEST(SliceLayerTest, TestSetupChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_slice_param()->add_slice_point(3); - SliceLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_0_); - EXPECT_EQ(this->blob_top_0_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_0_->channels(), 3); - EXPECT_EQ(this->blob_top_1_->channels(), 9); - EXPECT_EQ(this->blob_bottom_->channels(), - this->blob_top_0_->channels() + this->blob_top_1_->channels()); - EXPECT_EQ(this->blob_bottom_->height(), this->blob_top_0_->height()); - EXPECT_EQ(this->blob_bottom_->width(), this->blob_top_0_->width()); -} - -TYPED_TEST(SliceLayerTest, TestTrivialSlice) { - // Test the trivial (single output) "slice" operation -- - // should be the identity. 
- typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SliceLayer layer(layer_param); - this->blob_top_vec_0_.resize(1); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_0_); - ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_0_->shape()); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_EQ(this->blob_bottom_->cpu_data()[i], - this->blob_top_0_->cpu_data()[i]); - } -} - -TYPED_TEST(SliceLayerTest, TestSliceAcrossNum) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_slice_param()->set_axis(0); - SliceLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_0_); - const int top_num = this->blob_bottom_->num() / 2; - ASSERT_EQ(top_num, this->blob_top_0_->num()); - ASSERT_EQ(top_num, this->blob_top_1_->num()); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_0_); - for (int n = 0; n < top_num; ++n) { - for (int c = 0; c < this->blob_top_0_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_EQ(this->blob_bottom_->data_at(n, c, h, w), - this->blob_top_0_->data_at(n, c, h, w)); - } - } - } - for (int c = 0; c < this->blob_top_1_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_EQ(this->blob_bottom_->data_at(n + 3, c, h, w), - this->blob_top_1_->data_at(n, c, h, w)); - } - } - } - } -} - -TYPED_TEST(SliceLayerTest, TestSliceAcrossChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - // Slice at 2, 8: should produce output blobs with #channels 2, 6, 4. - const int kSlicePoint0 = 2; - const int kSlicePoint1 = 8; - layer_param.mutable_slice_param()->add_slice_point(kSlicePoint0); - layer_param.mutable_slice_param()->add_slice_point(kSlicePoint1); - SliceLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_1_); - ASSERT_EQ(kSlicePoint0, this->blob_top_0_->channels()); - ASSERT_EQ(kSlicePoint1 - kSlicePoint0, this->blob_top_1_->channels()); - ASSERT_EQ(this->blob_bottom_->channels() - kSlicePoint1, - this->blob_top_2_->channels()); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_1_); - for (int n = 0; n < this->blob_bottom_->num(); ++n) { - for (int c = 0; c < this->blob_top_0_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_EQ(this->blob_bottom_->data_at(n, c, h, w), - this->blob_top_0_->data_at(n, c, h, w)); - } - } - } - for (int c = 0; c < this->blob_top_1_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_EQ(this->blob_bottom_->data_at(n, c + kSlicePoint0, h, w), - this->blob_top_1_->data_at(n, c, h, w)); - } - } - } - for (int c = 0; c < this->blob_top_2_->channels(); ++c) { - for (int h = 0; h < this->blob_bottom_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_->width(); ++w) { - EXPECT_EQ(this->blob_bottom_->data_at(n, c + kSlicePoint1, h, w), - this->blob_top_2_->data_at(n, c, h, w)); - } - } - } - } -} - -TYPED_TEST(SliceLayerTest, TestGradientTrivial) { - // Test the trivial (single output) "slice" operation -- - // should be the identity. 
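// For the identity slice each top element depends only on the matching bottom
// element, so an element-wise gradient check (CheckGradientEltwise) suffices
// here; the non-trivial slices below use the exhaustive gradient check instead.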
- typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SliceLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - this->blob_top_vec_0_.resize(1); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_0_); -} - -TYPED_TEST(SliceLayerTest, TestGradientAcrossNum) { - typedef typename TypeParam::Dtype Dtype; - // Gradient checks are slow; reduce blob size. - this->ReduceBottomBlobSize(); - LayerParameter layer_param; - layer_param.mutable_slice_param()->set_axis(0); - SliceLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_0_); -} - -TYPED_TEST(SliceLayerTest, TestGradientAcrossChannels) { - typedef typename TypeParam::Dtype Dtype; - // Gradient checks are slow; reduce blob size. - this->ReduceBottomBlobSize(); - LayerParameter layer_param; - const int kSlicePoint = 4; - layer_param.mutable_slice_param()->add_slice_point(kSlicePoint); - SliceLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_0_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_softmax_layer.cpp b/src/caffe/test/test_softmax_layer.cpp deleted file mode 100644 index 9444357..0000000 --- a/src/caffe/test/test_softmax_layer.cpp +++ /dev/null @@ -1,152 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/softmax_layer.hpp" - -#ifdef USE_CUDNN -#include "caffe/layers/cudnn_softmax_layer.hpp" -#endif - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class SoftmaxLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - protected: - SoftmaxLayerTest() - : blob_bottom_(new Blob(2, 10, 2, 3)), - blob_top_(new Blob()) { - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~SoftmaxLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(SoftmaxLayerTest, TestDtypesAndDevices); - -TYPED_TEST(SoftmaxLayerTest, TestForward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SoftmaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Test sum - for (int i = 0; i < this->blob_bottom_->num(); ++i) { - for (int k = 0; k < this->blob_bottom_->height(); ++k) { - for (int l = 0; l < this->blob_bottom_->width(); ++l) { - Dtype sum = 0; - for (int j = 0; j < this->blob_top_->channels(); ++j) { - sum += this->blob_top_->data_at(i, j, k, l); - } - EXPECT_GE(sum, 0.999); - EXPECT_LE(sum, 1.001); - // Test exact values - Dtype scale = 0; - for (int j = 0; j < this->blob_bottom_->channels(); ++j) { - scale += exp(this->blob_bottom_->data_at(i, j, k, l)); - } - for (int j = 0; j < this->blob_bottom_->channels(); ++j) { - EXPECT_GE(this->blob_top_->data_at(i, j, k, l) + 1e-4, - exp(this->blob_bottom_->data_at(i, j, k, l)) / scale) - << "debug: " << i << " " << j; - EXPECT_LE(this->blob_top_->data_at(i, j, k, l) - 1e-4, - 
exp(this->blob_bottom_->data_at(i, j, k, l)) / scale) - << "debug: " << i << " " << j; - } - } - } - } -} - -TYPED_TEST(SoftmaxLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SoftmaxLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -#ifdef USE_CUDNN -template -class CuDNNSoftmaxLayerTest : public GPUDeviceTest { - protected: - CuDNNSoftmaxLayerTest() - : blob_bottom_(new Blob(2, 10, 2, 3)), - blob_top_(new Blob()) { - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~CuDNNSoftmaxLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(CuDNNSoftmaxLayerTest, TestDtypes); - -TYPED_TEST(CuDNNSoftmaxLayerTest, TestForwardCuDNN) { - LayerParameter layer_param; - CuDNNSoftmaxLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Test sum - for (int i = 0; i < this->blob_bottom_->num(); ++i) { - for (int k = 0; k < this->blob_bottom_->height(); ++k) { - for (int l = 0; l < this->blob_bottom_->width(); ++l) { - TypeParam sum = 0; - for (int j = 0; j < this->blob_top_->channels(); ++j) { - sum += this->blob_top_->data_at(i, j, k, l); - } - EXPECT_GE(sum, 0.999); - EXPECT_LE(sum, 1.001); - // Test exact values - TypeParam scale = 0; - for (int j = 0; j < this->blob_bottom_->channels(); ++j) { - scale += exp(this->blob_bottom_->data_at(i, j, k, l)); - } - for (int j = 0; j < this->blob_bottom_->channels(); ++j) { - EXPECT_GE(this->blob_top_->data_at(i, j, k, l) + 1e-4, - exp(this->blob_bottom_->data_at(i, j, k, l)) / scale) - << "debug: " << i << " " << j; - EXPECT_LE(this->blob_top_->data_at(i, j, k, l) - 1e-4, - exp(this->blob_bottom_->data_at(i, j, k, l)) / scale) - << "debug: " << i << " " << j; - } - } - } - } -} - -TYPED_TEST(CuDNNSoftmaxLayerTest, TestGradientCuDNN) { - LayerParameter layer_param; - CuDNNSoftmaxLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -#endif - -} // namespace caffe diff --git a/src/caffe/test/test_softmax_with_loss_layer.cpp b/src/caffe/test/test_softmax_with_loss_layer.cpp deleted file mode 100644 index c67f3e0..0000000 --- a/src/caffe/test/test_softmax_with_loss_layer.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include -#include - -#include "boost/scoped_ptr.hpp" -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/softmax_loss_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -using boost::scoped_ptr; - -namespace caffe { - -template -class SoftmaxWithLossLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - SoftmaxWithLossLayerTest() - : blob_bottom_data_(new Blob(10, 5, 2, 3)), - blob_bottom_label_(new Blob(10, 1, 2, 3)), - blob_top_loss_(new Blob()) { - // fill the values - FillerParameter filler_param; - filler_param.set_std(10); - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_data_); - 
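// A Gaussian filler with std 10 spreads the logits widely, so the softmax
// probabilities are far from uniform; the labels filled in just below are drawn
// uniformly from {0, ..., 4} (one of the 5 channels of the data blob).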
blob_bottom_vec_.push_back(blob_bottom_data_); - for (int i = 0; i < blob_bottom_label_->count(); ++i) { - blob_bottom_label_->mutable_cpu_data()[i] = caffe_rng_rand() % 5; - } - blob_bottom_vec_.push_back(blob_bottom_label_); - blob_top_vec_.push_back(blob_top_loss_); - } - virtual ~SoftmaxWithLossLayerTest() { - delete blob_bottom_data_; - delete blob_bottom_label_; - delete blob_top_loss_; - } - Blob* const blob_bottom_data_; - Blob* const blob_bottom_label_; - Blob* const blob_top_loss_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(SoftmaxWithLossLayerTest, TestDtypesAndDevices); - -TYPED_TEST(SoftmaxWithLossLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.add_loss_weight(3); - SoftmaxWithLossLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2, 1701); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -TYPED_TEST(SoftmaxWithLossLayerTest, TestForwardIgnoreLabel) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_loss_param()->set_normalize(false); - // First, compute the loss with all labels - scoped_ptr > layer( - new SoftmaxWithLossLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - Dtype full_loss = this->blob_top_loss_->cpu_data()[0]; - // Now, accumulate the loss, ignoring each label in {0, ..., 4} in turn. - Dtype accum_loss = 0; - for (int label = 0; label < 5; ++label) { - layer_param.mutable_loss_param()->set_ignore_label(label); - layer.reset(new SoftmaxWithLossLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); - accum_loss += this->blob_top_loss_->cpu_data()[0]; - } - // Check that each label was included all but once. 
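// Illustrative arithmetic (not part of the original test): with labels in
// {0, ..., 4} and normalization off, the run that ignores label k drops exactly
// that label's contribution. Writing loss_k for the summed loss of examples
// whose label is k (so sum_k loss_k = full_loss), summing the five per-run
// losses counts every label's loss four times:
//   accum_loss = sum_k (full_loss - loss_k) = 5 * full_loss - full_loss
//              = 4 * full_loss,
// which is what the EXPECT_NEAR below asserts (to within 1e-4).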
- EXPECT_NEAR(4 * full_loss, accum_loss, 1e-4); -} - -TYPED_TEST(SoftmaxWithLossLayerTest, TestGradientIgnoreLabel) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - // labels are in {0, ..., 4}, so we'll ignore about a fifth of them - layer_param.mutable_loss_param()->set_ignore_label(0); - SoftmaxWithLossLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2, 1701); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -TYPED_TEST(SoftmaxWithLossLayerTest, TestGradientUnnormalized) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_loss_param()->set_normalize(false); - SoftmaxWithLossLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2, 1701); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_, 0); -} - -} // namespace caffe diff --git a/src/caffe/test/test_solver.cpp b/src/caffe/test/test_solver.cpp deleted file mode 100644 index b181642..0000000 --- a/src/caffe/test/test_solver.cpp +++ /dev/null @@ -1,109 +0,0 @@ -#include -#include -#include - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/sgd_solvers.hpp" -#include "caffe/solver.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -using std::ostringstream; - -namespace caffe { - -template -class SolverTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - virtual void InitSolverFromProtoString(const string& proto) { - SolverParameter param; - CHECK(google::protobuf::TextFormat::ParseFromString(proto, ¶m)); - // Set the solver_mode according to current Caffe::mode. - switch (Caffe::mode()) { - case Caffe::CPU: - param.set_solver_mode(SolverParameter_SolverMode_CPU); - break; - case Caffe::GPU: - param.set_solver_mode(SolverParameter_SolverMode_GPU); - break; - default: - LOG(FATAL) << "Unknown Caffe mode: " << Caffe::mode(); - } - solver_.reset(new SGDSolver(param)); - } - - shared_ptr > solver_; -}; - -TYPED_TEST_CASE(SolverTest, TestDtypesAndDevices); - -TYPED_TEST(SolverTest, TestInitTrainTestNets) { - const string& proto = - "test_interval: 10 " - "test_iter: 10 " - "test_state: { stage: 'with-softmax' }" - "test_iter: 10 " - "test_state: {}" - "net_param { " - " name: 'TestNetwork' " - " layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " shape { " - " dim: 5 " - " dim: 2 " - " dim: 3 " - " dim: 4 " - " } " - " shape { " - " dim: 5 " - " } " - " } " - " top: 'data' " - " top: 'label' " - " } " - " layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " } " - " bottom: 'data' " - " top: 'innerprod' " - " } " - " layer { " - " name: 'accuracy' " - " type: 'Accuracy' " - " bottom: 'innerprod' " - " bottom: 'label' " - " top: 'accuracy' " - " exclude: { phase: TRAIN } " - " } " - " layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - " include: { phase: TRAIN } " - " include: { phase: TEST stage: 'with-softmax' } " - " } " - "} "; - this->InitSolverFromProtoString(proto); - ASSERT_TRUE(this->solver_->net() != NULL); - EXPECT_TRUE(this->solver_->net()->has_layer("loss")); - EXPECT_FALSE(this->solver_->net()->has_layer("accuracy")); - ASSERT_EQ(2, this->solver_->test_nets().size()); - EXPECT_TRUE(this->solver_->test_nets()[0]->has_layer("loss")); - 
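// test_nets()[0] was instantiated with test_state stage 'with-softmax', so it
// satisfies the 'loss' layer's "TEST + with-softmax" include rule; the
// stage-less test_nets()[1] checked below does not, while 'accuracy' (which is
// only excluded from TRAIN) appears in both test nets.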
EXPECT_TRUE(this->solver_->test_nets()[0]->has_layer("accuracy")); - EXPECT_FALSE(this->solver_->test_nets()[1]->has_layer("loss")); - EXPECT_TRUE(this->solver_->test_nets()[1]->has_layer("accuracy")); -} - -} // namespace caffe diff --git a/src/caffe/test/test_solver_factory.cpp b/src/caffe/test/test_solver_factory.cpp deleted file mode 100644 index eef5290..0000000 --- a/src/caffe/test/test_solver_factory.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include - -#include "boost/scoped_ptr.hpp" -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/solver.hpp" -#include "caffe/solver_factory.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class SolverFactoryTest : public MultiDeviceTest { - protected: - SolverParameter simple_solver_param() { - const string solver_proto = - "train_net_param { " - " layer { " - " name: 'data' type: 'DummyData' top: 'data' " - " dummy_data_param { shape { dim: 1 } } " - " } " - "} "; - SolverParameter solver_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - solver_proto, &solver_param)); - return solver_param; - } -}; - -TYPED_TEST_CASE(SolverFactoryTest, TestDtypesAndDevices); - -TYPED_TEST(SolverFactoryTest, TestCreateSolver) { - typedef typename TypeParam::Dtype Dtype; - typename SolverRegistry::CreatorRegistry& registry = - SolverRegistry::Registry(); - shared_ptr > solver; - SolverParameter solver_param = this->simple_solver_param(); - for (typename SolverRegistry::CreatorRegistry::iterator iter = - registry.begin(); iter != registry.end(); ++iter) { - solver_param.set_type(iter->first); - solver.reset(SolverRegistry::CreateSolver(solver_param)); - EXPECT_EQ(iter->first, solver->type()); - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_split_layer.cpp b/src/caffe/test/test_split_layer.cpp deleted file mode 100644 index 0071421..0000000 --- a/src/caffe/test/test_split_layer.cpp +++ /dev/null @@ -1,983 +0,0 @@ -#include -#include - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/split_layer.hpp" -#include "caffe/proto/caffe.pb.h" -#include "caffe/util/insert_splits.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class SplitLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - SplitLayerTest() - : blob_bottom_(new Blob(2, 3, 6, 5)), - blob_top_a_(new Blob()), - blob_top_b_(new Blob()) { - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_a_); - blob_top_vec_.push_back(blob_top_b_); - } - virtual ~SplitLayerTest() { - delete blob_bottom_; - delete blob_top_a_; - delete blob_top_b_; - } - Blob* const blob_bottom_; - Blob* const blob_top_a_; - Blob* const blob_top_b_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(SplitLayerTest, TestDtypesAndDevices); - -TYPED_TEST(SplitLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SplitLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_a_->num(), 2); - EXPECT_EQ(this->blob_top_a_->channels(), 3); - EXPECT_EQ(this->blob_top_a_->height(), 6); - 
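// SplitLayer simply replicates its bottom, so both tops keep the full
// 2x3x6x5 shape; the remaining dimensions of top_a and all of top_b are
// checked next.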
EXPECT_EQ(this->blob_top_a_->width(), 5); - EXPECT_EQ(this->blob_top_b_->num(), 2); - EXPECT_EQ(this->blob_top_b_->channels(), 3); - EXPECT_EQ(this->blob_top_b_->height(), 6); - EXPECT_EQ(this->blob_top_b_->width(), 5); -} - -TYPED_TEST(SplitLayerTest, Test) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SplitLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - Dtype bottom_value = this->blob_bottom_->cpu_data()[i]; - EXPECT_EQ(bottom_value, this->blob_top_a_->cpu_data()[i]); - EXPECT_EQ(bottom_value, this->blob_top_b_->cpu_data()[i]); - } -} - -TYPED_TEST(SplitLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SplitLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - - -class SplitLayerInsertionTest : public ::testing::Test { - protected: - void RunInsertionTest( - const string& input_param_string, const string& output_param_string) { - // Test that InsertSplits called on the proto specified by - // input_param_string results in the proto specified by - // output_param_string. - NetParameter input_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - input_param_string, &input_param)); - NetParameter expected_output_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - output_param_string, &expected_output_param)); - NetParameter actual_output_param; - InsertSplits(input_param, &actual_output_param); - EXPECT_EQ(expected_output_param.DebugString(), - actual_output_param.DebugString()); - // Also test idempotence. 
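// (i.e. running InsertSplits on its own output should be a no-op: after the
// first pass every top blob is consumed by at most one layer, so no further
// Split layers are needed. The DebugString comparison below checks this.)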
- NetParameter double_split_insert_param; - InsertSplits(actual_output_param, &double_split_insert_param); - EXPECT_EQ(actual_output_param.DebugString(), - double_split_insert_param.DebugString()); - } -}; - -TEST_F(SplitLayerInsertionTest, TestNoInsertion1) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunInsertionTest(input_proto, input_proto); -} - -TEST_F(SplitLayerInsertionTest, TestNoInsertion2) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'data_split' " - " type: 'Split' " - " bottom: 'data' " - " top: 'data_split_0' " - " top: 'data_split_1' " - "} " - "layer { " - " name: 'innerprod1' " - " type: 'InnerProduct' " - " bottom: 'data_split_0' " - " top: 'innerprod1' " - "} " - "layer { " - " name: 'innerprod2' " - " type: 'InnerProduct' " - " bottom: 'data_split_1' " - " top: 'innerprod2' " - "} " - "layer { " - " name: 'loss' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod1' " - " bottom: 'innerprod2' " - "} "; - this->RunInsertionTest(input_proto, input_proto); -} - -TEST_F(SplitLayerInsertionTest, TestNoInsertionImageNet) { - const string& input_proto = - "name: 'CaffeNet' " - "layer { " - " name: 'data' " - " type: 'Data' " - " data_param { " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " batch_size: 256 " - " } " - " transform_param { " - " crop_size: 227 " - " mirror: true " - " mean_file: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'conv1' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 96 " - " kernel_size: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layer { " - " name: 'relu1' " - " type: 'ReLU' " - " bottom: 'conv1' " - " top: 'conv1' " - "} " - "layer { " - " name: 'pool1' " - " type: 'Pooling' " - " pooling_param { " - " pool: MAX " - " kernel_size: 3 " - " stride: 2 " - " } " - " bottom: 'conv1' " - " top: 'pool1' " - "} " - "layer { " - " name: 'norm1' " - " type: 'LRN' " - " lrn_param { " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool1' " - " top: 'norm1' " - "} " - "layer { " - " name: 'conv2' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 256 " - " group: 2 " - " kernel_size: 5 " - " pad: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. 
" - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'norm1' " - " top: 'conv2' " - "} " - "layer { " - " name: 'relu2' " - " type: 'ReLU' " - " bottom: 'conv2' " - " top: 'conv2' " - "} " - "layer { " - " name: 'pool2' " - " type: 'Pooling' " - " pooling_param { " - " pool: MAX " - " kernel_size: 3 " - " stride: 2 " - " } " - " bottom: 'conv2' " - " top: 'pool2' " - "} " - "layer { " - " name: 'norm2' " - " type: 'LRN' " - " lrn_param { " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool2' " - " top: 'norm2' " - "} " - "layer { " - " name: 'conv3' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 384 " - " kernel_size: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'norm2' " - " top: 'conv3' " - "} " - "layer { " - " name: 'relu3' " - " type: 'ReLU' " - " bottom: 'conv3' " - " top: 'conv3' " - "} " - "layer { " - " name: 'conv4' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 384 " - " group: 2 " - " kernel_size: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'conv3' " - " top: 'conv4' " - "} " - "layer { " - " name: 'relu4' " - " type: 'ReLU' " - " bottom: 'conv4' " - " top: 'conv4' " - "} " - "layer { " - " name: 'conv5' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 256 " - " group: 2 " - " kernel_size: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'conv4' " - " top: 'conv5' " - "} " - "layer { " - " name: 'relu5' " - " type: 'ReLU' " - " bottom: 'conv5' " - " top: 'conv5' " - "} " - "layer { " - " name: 'pool5' " - " type: 'Pooling' " - " pooling_param { " - " kernel_size: 3 " - " pool: MAX " - " stride: 2 " - " } " - " bottom: 'conv5' " - " top: 'pool5' " - "} " - "layer { " - " name: 'fc6' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'pool5' " - " top: 'fc6' " - "} " - "layer { " - " name: 'relu6' " - " type: 'ReLU' " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layer { " - " name: 'drop6' " - " type: 'Dropout' " - " dropout_param { " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layer { " - " name: 'fc7' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. 
" - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'fc6' " - " top: 'fc7' " - "} " - "layer { " - " name: 'relu7' " - " type: 'ReLU' " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layer { " - " name: 'drop7' " - " type: 'Dropout' " - " dropout_param { " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layer { " - " name: 'fc8' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'fc7' " - " top: 'fc8' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - this->RunInsertionTest(input_proto, input_proto); -} - -TEST_F(SplitLayerInsertionTest, TestNoInsertionWithInPlace) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod' " - "} " - "layer { " - " name: 'relu' " - " type: 'ReLU' " - " bottom: 'innerprod' " - " top: 'innerprod' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'innerprod' " - " bottom: 'label' " - "} "; - this->RunInsertionTest(input_proto, input_proto); -} - -TEST_F(SplitLayerInsertionTest, TestLossInsertion) { - const string& input_proto = - "name: 'UnsharedWeightsNetwork' " - "force_backward: true " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " num: 5 " - " channels: 2 " - " height: 3 " - " width: 4 " - " data_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " } " - " top: 'data' " - "} " - "layer { " - " name: 'innerproduct1' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " param { name: 'unsharedweights1' } " - " bottom: 'data' " - " top: 'innerproduct1' " - " loss_weight: 2.5 " - "} " - "layer { " - " name: 'innerproduct2' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " param { name: 'unsharedweights2' } " - " bottom: 'data' " - " top: 'innerproduct2' " - "} " - "layer { " - " name: 'loss' " - " type: 'EuclideanLoss' " - " bottom: 'innerproduct1' " - " bottom: 'innerproduct2' " - "} "; - const string& expected_output_proto = - "name: 'UnsharedWeightsNetwork' " - "force_backward: true " - "layer { " - " name: 'data' " - " type: 'DummyData' " - " dummy_data_param { " - " num: 5 " - " channels: 2 " - " height: 3 " - " width: 4 " - " data_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " } " - " top: 'data' " - "} " - "layer { " - " name: 'data_data_0_split' " - " type: 'Split' " - " bottom: 'data' " - " top: 'data_data_0_split_0' " - " top: 'data_data_0_split_1' " - "} " - "layer { " - " name: 'innerproduct1' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " param { name: 'unsharedweights1' } " - " 
bottom: 'data_data_0_split_0' " - " top: 'innerproduct1' " - "} " - "layer { " - " name: 'innerproduct1_innerproduct1_0_split' " - " type: 'Split' " - " bottom: 'innerproduct1' " - " top: 'innerproduct1_innerproduct1_0_split_0' " - " top: 'innerproduct1_innerproduct1_0_split_1' " - " loss_weight: 2.5 " - " loss_weight: 0 " - "} " - "layer { " - " name: 'innerproduct2' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 10 " - " bias_term: false " - " weight_filler { " - " type: 'gaussian' " - " std: 10 " - " } " - " } " - " param { name: 'unsharedweights2' } " - " bottom: 'data_data_0_split_1' " - " top: 'innerproduct2' " - "} " - "layer { " - " name: 'loss' " - " type: 'EuclideanLoss' " - " bottom: 'innerproduct1_innerproduct1_0_split_1' " - " bottom: 'innerproduct2' " - "} "; - this->RunInsertionTest(input_proto, expected_output_proto); -} - -TEST_F(SplitLayerInsertionTest, TestInsertion) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod1' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod1' " - "} " - "layer { " - " name: 'innerprod2' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod2' " - "} " - "layer { " - " name: 'innerprod3' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod3' " - "} " - "layer { " - " name: 'loss1' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod1' " - " bottom: 'innerprod2' " - "} " - "layer { " - " name: 'loss2' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod2' " - " bottom: 'innerprod3' " - "} "; - const string& expected_output_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'data_data_0_split' " - " type: 'Split' " - " bottom: 'data' " - " top: 'data_data_0_split_0' " - " top: 'data_data_0_split_1' " - " top: 'data_data_0_split_2' " - "} " - "layer { " - " name: 'innerprod1' " - " type: 'InnerProduct' " - " bottom: 'data_data_0_split_0' " - " top: 'innerprod1' " - "} " - "layer { " - " name: 'innerprod2' " - " type: 'InnerProduct' " - " bottom: 'data_data_0_split_1' " - " top: 'innerprod2' " - "} " - "layer { " - " name: 'innerprod2_innerprod2_0_split' " - " type: 'Split' " - " bottom: 'innerprod2' " - " top: 'innerprod2_innerprod2_0_split_0' " - " top: 'innerprod2_innerprod2_0_split_1' " - "} " - "layer { " - " name: 'innerprod3' " - " type: 'InnerProduct' " - " bottom: 'data_data_0_split_2' " - " top: 'innerprod3' " - "} " - "layer { " - " name: 'loss1' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod1' " - " bottom: 'innerprod2_innerprod2_0_split_0' " - "} " - "layer { " - " name: 'loss2' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod2_innerprod2_0_split_1' " - " bottom: 'innerprod3' " - "} "; - this->RunInsertionTest(input_proto, expected_output_proto); -} - -TEST_F(SplitLayerInsertionTest, TestInsertionTwoTop) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod1' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod1' " - "} " - "layer { " - " name: 'innerprod2' " - " type: 'InnerProduct' " - " bottom: 'label' " - " top: 'innerprod2' " - "} " - "layer { " - " name: 'innerprod3' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod3' " - "} " - "layer { " - " 
name: 'innerprod4' " - " type: 'InnerProduct' " - " bottom: 'label' " - " top: 'innerprod4' " - "} " - "layer { " - " name: 'loss1' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod1' " - " bottom: 'innerprod3' " - "} " - "layer { " - " name: 'loss2' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod2' " - " bottom: 'innerprod4' " - "} "; - const string& expected_output_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'data_data_0_split' " - " type: 'Split' " - " bottom: 'data' " - " top: 'data_data_0_split_0' " - " top: 'data_data_0_split_1' " - "} " - "layer { " - " name: 'label_data_1_split' " - " type: 'Split' " - " bottom: 'label' " - " top: 'label_data_1_split_0' " - " top: 'label_data_1_split_1' " - "} " - "layer { " - " name: 'innerprod1' " - " type: 'InnerProduct' " - " bottom: 'data_data_0_split_0' " - " top: 'innerprod1' " - "} " - "layer { " - " name: 'innerprod2' " - " type: 'InnerProduct' " - " bottom: 'label_data_1_split_0' " - " top: 'innerprod2' " - "} " - "layer { " - " name: 'innerprod3' " - " type: 'InnerProduct' " - " bottom: 'data_data_0_split_1' " - " top: 'innerprod3' " - "} " - "layer { " - " name: 'innerprod4' " - " type: 'InnerProduct' " - " bottom: 'label_data_1_split_1' " - " top: 'innerprod4' " - "} " - "layer { " - " name: 'loss1' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod1' " - " bottom: 'innerprod3' " - "} " - "layer { " - " name: 'loss2' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod2' " - " bottom: 'innerprod4' " - "} "; - this->RunInsertionTest(input_proto, expected_output_proto); -} - -TEST_F(SplitLayerInsertionTest, TestWithInPlace) { - const string& input_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'innerprod1' " - " type: 'InnerProduct' " - " bottom: 'data' " - " top: 'innerprod1' " - "} " - "layer { " - " name: 'relu1' " - " type: 'ReLU' " - " bottom: 'innerprod1' " - " top: 'innerprod1' " - "} " - "layer { " - " name: 'innerprod2' " - " type: 'InnerProduct' " - " bottom: 'innerprod1' " - " top: 'innerprod2' " - "} " - "layer { " - " name: 'loss1' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod1' " - " bottom: 'label' " - "} " - "layer { " - " name: 'loss2' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod2' " - " bottom: 'data' " - "} "; - const string& expected_output_proto = - "name: 'TestNetwork' " - "layer { " - " name: 'data' " - " type: 'Data' " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'data_data_0_split' " - " type: 'Split' " - " bottom: 'data' " - " top: 'data_data_0_split_0' " - " top: 'data_data_0_split_1' " - "} " - "layer { " - " name: 'innerprod1' " - " type: 'InnerProduct' " - " bottom: 'data_data_0_split_0' " - " top: 'innerprod1' " - "} " - "layer { " - " name: 'relu1' " - " type: 'ReLU' " - " bottom: 'innerprod1' " - " top: 'innerprod1' " - "} " - "layer { " - " name: 'innerprod1_relu1_0_split' " - " type: 'Split' " - " bottom: 'innerprod1' " - " top: 'innerprod1_relu1_0_split_0' " - " top: 'innerprod1_relu1_0_split_1' " - "} " - "layer { " - " name: 'innerprod2' " - " type: 'InnerProduct' " - " bottom: 'innerprod1_relu1_0_split_0' " - " top: 'innerprod2' " - "} " - "layer { " - " name: 'loss1' " - " type: 'EuclideanLoss' " - " bottom: 'innerprod1_relu1_0_split_1' " - " bottom: 'label' " - "} " - "layer { " - " name: 'loss2' " - " type: 'EuclideanLoss' " - " bottom: 
'innerprod2' " - " bottom: 'data_data_0_split_1' " - "} "; - this->RunInsertionTest(input_proto, expected_output_proto); -} - -} // namespace caffe diff --git a/src/caffe/test/test_spp_layer.cpp b/src/caffe/test/test_spp_layer.cpp deleted file mode 100644 index 59a3af2..0000000 --- a/src/caffe/test/test_spp_layer.cpp +++ /dev/null @@ -1,134 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/concat_layer.hpp" -#include "caffe/layers/flatten_layer.hpp" -#include "caffe/layers/pooling_layer.hpp" -#include "caffe/layers/split_layer.hpp" -#include "caffe/layers/spp_layer.hpp" - - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -template -class SPPLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - SPPLayerTest() - : blob_bottom_(new Blob()), - blob_bottom_2_(new Blob()), - blob_bottom_3_(new Blob()), - blob_top_(new Blob()) {} - virtual void SetUp() { - Caffe::set_random_seed(1701); - blob_bottom_->Reshape(2, 3, 9, 8); - blob_bottom_2_->Reshape(4, 3, 1024, 765); - blob_bottom_3_->Reshape(10, 3, 7, 7); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_bottom_vec_2_.push_back(blob_bottom_2_); - blob_bottom_vec_3_.push_back(blob_bottom_3_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~SPPLayerTest() { delete blob_bottom_; delete blob_top_; } - - Blob* const blob_bottom_; - Blob* const blob_bottom_2_; - Blob* const blob_bottom_3_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_bottom_vec_2_; - vector*> blob_bottom_vec_3_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(SPPLayerTest, TestDtypesAndDevices); - -TYPED_TEST(SPPLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_spp_param()->set_pyramid_height(3); - SPPLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - // expected number of pool results is geometric sum - // (1 - r ** n)/(1 - r) where r = 4 and n = pyramid_height - // (1 - 4 ** 3)/(1 - 4) = 21 - // multiply bottom num_channels * expected_pool_results - // to get expected num_channels (3 * 21 = 63) - EXPECT_EQ(this->blob_top_->num(), 2); - EXPECT_EQ(this->blob_top_->channels(), 63); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); -} - -TYPED_TEST(SPPLayerTest, TestEqualOutputDims) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_spp_param()->set_pyramid_height(5); - SPPLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_2_, this->blob_top_vec_); - // expected number of pool results is geometric sum - // (1 - r ** n)/(1 - r) where r = 4 and n = pyramid_height - // (1 - 4 ** 5)/(1 - 4) = 341 - // multiply bottom num_channels * expected_pool_results - // to get expected num_channels (3 * 341 = 1023) - EXPECT_EQ(this->blob_top_->num(), 4); - EXPECT_EQ(this->blob_top_->channels(), 1023); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); -} - -TYPED_TEST(SPPLayerTest, TestEqualOutputDims2) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_spp_param()->set_pyramid_height(3); - SPPLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_3_, 
this->blob_top_vec_); - // expected number of pool results is geometric sum - // (1 - r ** n)/(1 - r) where r = 4 and n = pyramid_height - // (1 - 4 ** 3)/(1 - 4) = 21 - // multiply bottom num_channels * expected_pool_results - // to get expected num_channels (3 * 21 = 63) - EXPECT_EQ(this->blob_top_->num(), 10); - EXPECT_EQ(this->blob_top_->channels(), 63); - EXPECT_EQ(this->blob_top_->height(), 1); - EXPECT_EQ(this->blob_top_->width(), 1); -} - -TYPED_TEST(SPPLayerTest, TestForwardBackward) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - layer_param.mutable_spp_param()->set_pyramid_height(3); - SPPLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - vector propagate_down(this->blob_bottom_vec_.size(), true); - layer.Backward(this->blob_top_vec_, propagate_down, - this->blob_bottom_vec_); -} - -TYPED_TEST(SPPLayerTest, TestGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - SPPParameter* spp_param = layer_param.mutable_spp_param(); - spp_param->set_pyramid_height(3); - SPPLayer layer(layer_param); - GradientChecker checker(1e-4, 1e-2); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - - -} // namespace caffe diff --git a/src/caffe/test/test_stochastic_pooling.cpp b/src/caffe/test/test_stochastic_pooling.cpp deleted file mode 100644 index cd5db83..0000000 --- a/src/caffe/test/test_stochastic_pooling.cpp +++ /dev/null @@ -1,175 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/pooling_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -using std::min; - -namespace caffe { - -template -class StochasticPoolingLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - StochasticPoolingLayerTest() - : blob_bottom_(new Blob()), - blob_top_(new Blob()) {} - virtual void SetUp() { - Caffe::set_random_seed(1701); - blob_bottom_->Reshape(2, 3, 6, 5); - // fill the values - FillerParameter filler_param; - filler_param.set_min(0.1); - filler_param.set_max(1.); - UniformFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - - virtual ~StochasticPoolingLayerTest() { - delete blob_bottom_; delete blob_top_; - } - - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -template -class CPUStochasticPoolingLayerTest - : public StochasticPoolingLayerTest > { -}; - -TYPED_TEST_CASE(CPUStochasticPoolingLayerTest, TestDtypes); - -TYPED_TEST(CPUStochasticPoolingLayerTest, TestSetup) { - LayerParameter layer_param; - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); - EXPECT_EQ(this->blob_top_->height(), 3); - EXPECT_EQ(this->blob_top_->width(), 2); -} - -#ifndef CPU_ONLY - -template -class GPUStochasticPoolingLayerTest - : public StochasticPoolingLayerTest > { -}; - 
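// Background for the checks below (a sketch of the expected behaviour, not part
// of the original tests): in TRAIN mode stochastic pooling samples one
// activation per 3x3 window with probability proportional to its value, so the
// pooled output always equals some input in the window (the has_equal check),
// and its expected value sum(x_i^2) / sum(x_i) is at least the plain window
// mean. E.g. for a window {0.2, 0.4, 0.6}: mean = 0.4, while
// (0.04 + 0.16 + 0.36) / 1.2 ~= 0.47. With inputs uniform in [0.1, 1]
// (mean 0.55) this is why the TRAIN-phase test expects total / count >= 0.55.
// In TEST mode a probability-weighted average is used instead, which can never
// exceed the window maximum (the smaller_than_max check).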
-TYPED_TEST_CASE(GPUStochasticPoolingLayerTest, TestDtypes); - -TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochastic) { - LayerParameter layer_param; - layer_param.set_phase(TRAIN); - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - // Check if the output is correct - it should do random sampling - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - const TypeParam* top_data = this->blob_top_->cpu_data(); - TypeParam total = 0; - for (int n = 0; n < this->blob_top_->num(); ++n) { - for (int c = 0; c < this->blob_top_->channels(); ++c) { - for (int ph = 0; ph < this->blob_top_->height(); ++ph) { - for (int pw = 0; pw < this->blob_top_->width(); ++pw) { - TypeParam pooled = top_data[this->blob_top_->offset(n, c, ph, pw)]; - total += pooled; - int hstart = ph * 2; - int hend = min(hstart + 3, this->blob_bottom_->height()); - int wstart = pw * 2; - int wend = min(wstart + 3, this->blob_bottom_->width()); - bool has_equal = false; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - has_equal |= (pooled == bottom_data[this->blob_bottom_-> - offset(n, c, h, w)]); - } - } - EXPECT_TRUE(has_equal); - } - } - } - } - // When we are doing stochastic pooling, the average we get should be higher - // than the simple data average since we are weighting more on higher-valued - // ones. - EXPECT_GE(total / this->blob_top_->count(), 0.55); -} - -TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochasticTestPhase) { - LayerParameter layer_param; - layer_param.set_phase(TEST); - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC); - PoolingLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - // Check if the output is correct - it should do random sampling - const TypeParam* bottom_data = this->blob_bottom_->cpu_data(); - const TypeParam* top_data = this->blob_top_->cpu_data(); - for (int n = 0; n < this->blob_top_->num(); ++n) { - for (int c = 0; c < this->blob_top_->channels(); ++c) { - for (int ph = 0; ph < this->blob_top_->height(); ++ph) { - for (int pw = 0; pw < this->blob_top_->width(); ++pw) { - TypeParam pooled = top_data[this->blob_top_->offset(n, c, ph, pw)]; - int hstart = ph * 2; - int hend = min(hstart + 3, this->blob_bottom_->height()); - int wstart = pw * 2; - int wend = min(wstart + 3, this->blob_bottom_->width()); - bool smaller_than_max = false; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - smaller_than_max |= (pooled <= bottom_data[this->blob_bottom_-> - offset(n, c, h, w)]); - } - } - EXPECT_TRUE(smaller_than_max); - } - } - } - } -} - -TYPED_TEST(GPUStochasticPoolingLayerTest, TestGradient) { - LayerParameter layer_param; - layer_param.set_phase(TRAIN); - PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC); - PoolingLayer layer(layer_param); - GradientChecker checker(1e-4, 1e-2); - // it is too 
expensive to call curand multiple times, so we don't do an - // exhaustive gradient check. - checker.CheckGradient(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -#endif - -} // namespace caffe diff --git a/src/caffe/test/test_syncedmem.cpp b/src/caffe/test/test_syncedmem.cpp deleted file mode 100644 index 16dfb58..0000000 --- a/src/caffe/test/test_syncedmem.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/common.hpp" -#include "caffe/syncedmem.hpp" -#include "caffe/util/device_alternate.hpp" -#include "caffe/util/math_functions.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -class SyncedMemoryTest : public ::testing::Test {}; - -TEST_F(SyncedMemoryTest, TestInitialization) { - SyncedMemory mem(10); - EXPECT_EQ(mem.head(), SyncedMemory::UNINITIALIZED); - EXPECT_EQ(mem.size(), 10); - SyncedMemory* p_mem = new SyncedMemory(10 * sizeof(float)); - EXPECT_EQ(p_mem->size(), 10 * sizeof(float)); - delete p_mem; -} - -#ifndef CPU_ONLY // GPU test - -TEST_F(SyncedMemoryTest, TestAllocationCPUGPU) { - SyncedMemory mem(10); - EXPECT_TRUE(mem.cpu_data()); - EXPECT_TRUE(mem.gpu_data()); - EXPECT_TRUE(mem.mutable_cpu_data()); - EXPECT_TRUE(mem.mutable_gpu_data()); -} - -#endif - -TEST_F(SyncedMemoryTest, TestAllocationCPU) { - SyncedMemory mem(10); - EXPECT_TRUE(mem.cpu_data()); - EXPECT_TRUE(mem.mutable_cpu_data()); -} - -#ifndef CPU_ONLY // GPU test - -TEST_F(SyncedMemoryTest, TestAllocationGPU) { - SyncedMemory mem(10); - EXPECT_TRUE(mem.gpu_data()); - EXPECT_TRUE(mem.mutable_gpu_data()); -} - -#endif - -TEST_F(SyncedMemoryTest, TestCPUWrite) { - SyncedMemory mem(10); - void* cpu_data = mem.mutable_cpu_data(); - EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_CPU); - caffe_memset(mem.size(), 1, cpu_data); - for (int i = 0; i < mem.size(); ++i) { - EXPECT_EQ((static_cast(cpu_data))[i], 1); - } - // do another round - cpu_data = mem.mutable_cpu_data(); - EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_CPU); - caffe_memset(mem.size(), 2, cpu_data); - for (int i = 0; i < mem.size(); ++i) { - EXPECT_EQ((static_cast(cpu_data))[i], 2); - } -} - -#ifndef CPU_ONLY // GPU test - -TEST_F(SyncedMemoryTest, TestGPURead) { - SyncedMemory mem(10); - void* cpu_data = mem.mutable_cpu_data(); - EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_CPU); - caffe_memset(mem.size(), 1, cpu_data); - const void* gpu_data = mem.gpu_data(); - EXPECT_EQ(mem.head(), SyncedMemory::SYNCED); - // check if values are the same - char* recovered_value = new char[10]; - caffe_gpu_memcpy(10, gpu_data, recovered_value); - for (int i = 0; i < mem.size(); ++i) { - EXPECT_EQ((static_cast(recovered_value))[i], 1); - } - // do another round - cpu_data = mem.mutable_cpu_data(); - EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_CPU); - caffe_memset(mem.size(), 2, cpu_data); - for (int i = 0; i < mem.size(); ++i) { - EXPECT_EQ((static_cast(cpu_data))[i], 2); - } - gpu_data = mem.gpu_data(); - EXPECT_EQ(mem.head(), SyncedMemory::SYNCED); - // check if values are the same - caffe_gpu_memcpy(10, gpu_data, recovered_value); - for (int i = 0; i < mem.size(); ++i) { - EXPECT_EQ((static_cast(recovered_value))[i], 2); - } - delete[] recovered_value; -} - -TEST_F(SyncedMemoryTest, TestGPUWrite) { - SyncedMemory mem(10); - void* gpu_data = mem.mutable_gpu_data(); - EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_GPU); - caffe_gpu_memset(mem.size(), 1, gpu_data); - const void* cpu_data = mem.cpu_data(); - for (int i = 0; i < mem.size(); ++i) { - EXPECT_EQ((static_cast(cpu_data))[i], 1); - } - 
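// Reading through cpu_data() above forced a device-to-host copy, so the head
// state should have moved from HEAD_AT_GPU to SYNCED.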
EXPECT_EQ(mem.head(), SyncedMemory::SYNCED); - - gpu_data = mem.mutable_gpu_data(); - EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_GPU); - caffe_gpu_memset(mem.size(), 2, gpu_data); - cpu_data = mem.cpu_data(); - for (int i = 0; i < mem.size(); ++i) { - EXPECT_EQ((static_cast(cpu_data))[i], 2); - } - EXPECT_EQ(mem.head(), SyncedMemory::SYNCED); -} - -#endif - -} // namespace caffe diff --git a/src/caffe/test/test_tanh_layer.cpp b/src/caffe/test/test_tanh_layer.cpp deleted file mode 100644 index bb8699a..0000000 --- a/src/caffe/test/test_tanh_layer.cpp +++ /dev/null @@ -1,101 +0,0 @@ -#include -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/tanh_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - -namespace caffe { - -double tanh_naive(double x) { - if (x < -40) { - // avoid negative overflow - return -1; - } else if (x > 40) { - // avoid positive overflow - return 1; - } else { - // exact expression for tanh, which is unstable for large x - double exp2x = exp(2 * x); - return (exp2x - 1.0) / (exp2x + 1.0); - } -} - -template -class TanHLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - TanHLayerTest() - : blob_bottom_(new Blob(2, 3, 4, 5)), - blob_top_(new Blob()) { - Caffe::set_random_seed(1701); - FillerParameter filler_param; - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~TanHLayerTest() { delete blob_bottom_; delete blob_top_; } - - void TestForward(Dtype filler_std) { - FillerParameter filler_param; - filler_param.set_std(filler_std); - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - - LayerParameter layer_param; - TanHLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - const Dtype min_precision = 1e-5; - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - Dtype expected_value = tanh_naive(bottom_data[i]); - Dtype precision = std::max( - Dtype(std::abs(expected_value * Dtype(1e-4))), min_precision); - EXPECT_NEAR(expected_value, top_data[i], precision); - } - } - - void TestBackward(Dtype filler_std) { - FillerParameter filler_param; - filler_param.set_std(filler_std); - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - - LayerParameter layer_param; - TanHLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2, 1701); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); - } - - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(TanHLayerTest, TestDtypesAndDevices); - -TYPED_TEST(TanHLayerTest, TestTanH) { - this->TestForward(1.0); -} - -TYPED_TEST(TanHLayerTest, TestTanHOverflow) { - // this will fail if tanh overflow is not properly handled - this->TestForward(10000.0); -} - -TYPED_TEST(TanHLayerTest, TestTanHGradient) { - this->TestBackward(1.0); -} - -} // namespace caffe diff --git a/src/caffe/test/test_threshold_layer.cpp b/src/caffe/test/test_threshold_layer.cpp deleted file mode 100644 index 1e84cc5..0000000 --- a/src/caffe/test/test_threshold_layer.cpp +++ /dev/null @@ -1,98 +0,0 @@ -#include - -#include "gtest/gtest.h" - 
-#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/threshold_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -template -class ThresholdLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - protected: - ThresholdLayerTest() - : blob_bottom_(new Blob(2, 3, 6, 5)), - blob_top_(new Blob()) { - Caffe::set_random_seed(1701); - // fill the values - FillerParameter filler_param; - GaussianFiller filler(filler_param); - filler.Fill(this->blob_bottom_); - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - } - virtual ~ThresholdLayerTest() { delete blob_bottom_; delete blob_top_; } - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(ThresholdLayerTest, TestDtypesAndDevices); - - -TYPED_TEST(ThresholdLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ThresholdLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); - EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_->height()); - EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_->width()); -} - -TYPED_TEST(ThresholdLayerTest, Test) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ThresholdLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - const Dtype threshold_ = layer_param.threshold_param().threshold(); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_GE(top_data[i], 0.); - EXPECT_LE(top_data[i], 1.); - if (top_data[i] == 0) { - EXPECT_LE(bottom_data[i], threshold_); - } - if (top_data[i] == 1) { - EXPECT_GT(bottom_data[i], threshold_); - } - } -} - -TYPED_TEST(ThresholdLayerTest, Test2) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - ThresholdParameter* threshold_param = - layer_param.mutable_threshold_param(); - threshold_param->set_threshold(0.5); - ThresholdLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - // Now, check values - const Dtype* bottom_data = this->blob_bottom_->cpu_data(); - const Dtype* top_data = this->blob_top_->cpu_data(); - const Dtype threshold_ = layer_param.threshold_param().threshold(); - EXPECT_FLOAT_EQ(threshold_, 0.5); - for (int i = 0; i < this->blob_bottom_->count(); ++i) { - EXPECT_GE(top_data[i], 0.); - EXPECT_LE(top_data[i], 1.); - if (top_data[i] == 0) { - EXPECT_LE(bottom_data[i], threshold_); - } - if (top_data[i] == 1) { - EXPECT_GT(bottom_data[i], threshold_); - } - } -} - -} // namespace caffe diff --git a/src/caffe/test/test_tile_layer.cpp b/src/caffe/test/test_tile_layer.cpp deleted file mode 100644 index 7ff7552..0000000 --- a/src/caffe/test/test_tile_layer.cpp +++ /dev/null @@ -1,161 +0,0 @@ -#include - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layers/tile_layer.hpp" - -#include "caffe/test/test_caffe_main.hpp" -#include "caffe/test/test_gradient_check_util.hpp" - 
-namespace caffe { - -template -class TileLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; - - protected: - TileLayerTest() - : blob_bottom_(new Blob(2, 3, 4, 5)), - blob_top_(new Blob()) {} - virtual void SetUp() { - blob_bottom_vec_.push_back(blob_bottom_); - blob_top_vec_.push_back(blob_top_); - FillerParameter filler_param; - filler_param.set_mean(0.0); - filler_param.set_std(1.0); - GaussianFiller filler(filler_param); - filler.Fill(blob_bottom_); - } - - virtual ~TileLayerTest() { - delete blob_bottom_; - delete blob_top_; - } - - Blob* const blob_bottom_; - Blob* const blob_top_; - vector*> blob_bottom_vec_; - vector*> blob_top_vec_; -}; - -TYPED_TEST_CASE(TileLayerTest, TestDtypesAndDevices); - -TYPED_TEST(TileLayerTest, TestTrivialSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - const int kNumTiles = 1; - layer_param.mutable_tile_param()->set_tiles(kNumTiles); - for (int i = 0; i < this->blob_bottom_->num_axes(); ++i) { - layer_param.mutable_tile_param()->set_axis(i); - TileLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), this->blob_bottom_->num_axes()); - for (int j = 0; j < this->blob_bottom_->num_axes(); ++j) { - EXPECT_EQ(this->blob_top_->shape(j), this->blob_bottom_->shape(j)); - } - } -} - -TYPED_TEST(TileLayerTest, TestSetup) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - const int kNumTiles = 3; - layer_param.mutable_tile_param()->set_tiles(kNumTiles); - for (int i = 0; i < this->blob_bottom_->num_axes(); ++i) { - layer_param.mutable_tile_param()->set_axis(i); - TileLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), this->blob_bottom_->num_axes()); - for (int j = 0; j < this->blob_bottom_->num_axes(); ++j) { - const int top_dim = - ((i == j) ? 
kNumTiles : 1) * this->blob_bottom_->shape(j); - EXPECT_EQ(top_dim, this->blob_top_->shape(j)); - } - } -} - -TYPED_TEST(TileLayerTest, TestForwardNum) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - const int kTileAxis = 0; - const int kNumTiles = 3; - layer_param.mutable_tile_param()->set_axis(kTileAxis); - layer_param.mutable_tile_param()->set_tiles(kNumTiles); - TileLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_top_->num(); ++n) { - for (int c = 0; c < this->blob_top_->channels(); ++c) { - for (int h = 0; h < this->blob_top_->height(); ++h) { - for (int w = 0; w < this->blob_top_->width(); ++w) { - const int bottom_n = n % this->blob_bottom_->num(); - EXPECT_EQ(this->blob_bottom_->data_at(bottom_n, c, h, w), - this->blob_top_->data_at(n, c, h, w)); - } - } - } - } -} - -TYPED_TEST(TileLayerTest, TestForwardChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - const int kNumTiles = 3; - layer_param.mutable_tile_param()->set_tiles(kNumTiles); - TileLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - for (int n = 0; n < this->blob_top_->num(); ++n) { - for (int c = 0; c < this->blob_top_->channels(); ++c) { - for (int h = 0; h < this->blob_top_->height(); ++h) { - for (int w = 0; w < this->blob_top_->width(); ++w) { - const int bottom_c = c % this->blob_bottom_->channels(); - EXPECT_EQ(this->blob_bottom_->data_at(n, bottom_c, h, w), - this->blob_top_->data_at(n, c, h, w)); - } - } - } - } -} - -TYPED_TEST(TileLayerTest, TestTrivialGradient) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - const int kNumTiles = 1; - layer_param.mutable_tile_param()->set_tiles(kNumTiles); - TileLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(TileLayerTest, TestGradientNum) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - const int kTileAxis = 0; - const int kNumTiles = 3; - layer_param.mutable_tile_param()->set_axis(kTileAxis); - layer_param.mutable_tile_param()->set_tiles(kNumTiles); - TileLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -TYPED_TEST(TileLayerTest, TestGradientChannels) { - typedef typename TypeParam::Dtype Dtype; - LayerParameter layer_param; - const int kTileAxis = 1; - const int kNumTiles = 3; - layer_param.mutable_tile_param()->set_axis(kTileAxis); - layer_param.mutable_tile_param()->set_tiles(kNumTiles); - TileLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-2); - checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); -} - -} // namespace caffe diff --git a/src/caffe/test/test_upgrade_proto.cpp b/src/caffe/test/test_upgrade_proto.cpp deleted file mode 100644 index 9dcc2aa..0000000 --- a/src/caffe/test/test_upgrade_proto.cpp +++ /dev/null @@ -1,2989 +0,0 @@ -#include -#include - -#include "boost/scoped_ptr.hpp" -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/db.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/upgrade_proto.hpp" - 
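// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the original diff.]
// The deleted test_tile_layer.cpp above pins down two properties of TileLayer:
// (1) only the tiled axis grows, by a factor of `tiles`; (2) a coordinate on
// the tiled axis maps back to the bottom blob by taking it modulo the bottom
// extent along that axis. Standalone illustration (plain helpers, not Caffe):
#include <cstdio>
#include <vector>

std::vector<int> tiled_shape(const std::vector<int>& bottom_shape,
                             int axis, int tiles) {
  std::vector<int> top_shape(bottom_shape);
  top_shape[axis] *= tiles;  // every other axis keeps its extent
  return top_shape;
}

int source_index(int top_index_on_axis, int bottom_extent_on_axis) {
  // Tiling repeats the bottom data, so indices wrap around with modulo.
  return top_index_on_axis % bottom_extent_on_axis;
}

int main() {
  const std::vector<int> bottom = {2, 3, 4, 5};  // same shape as in the test
  const std::vector<int> top = tiled_shape(bottom, /*axis=*/1, /*tiles=*/3);
  std::printf("top shape: %d %d %d %d\n", top[0], top[1], top[2], top[3]);
  std::printf("top channel 7 reads bottom channel %d\n", source_index(7, bottom[1]));
  return 0;
}
// ---------------------------------------------------------------------------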
-#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -class PaddingLayerUpgradeTest : public ::testing::Test { - protected: - void RunPaddingUpgradeTest( - const string& input_param_string, const string& output_param_string) { - // Test that UpgradeV0PaddingLayers called on the proto specified by - // input_param_string results in the proto specified by - // output_param_string. - NetParameter input_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - input_param_string, &input_param)); - NetParameter expected_output_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - output_param_string, &expected_output_param)); - NetParameter actual_output_param; - UpgradeV0PaddingLayers(input_param, &actual_output_param); - EXPECT_EQ(expected_output_param.DebugString(), - actual_output_param.DebugString()); - // Also test idempotence. - NetParameter double_pad_upgrade_param; - UpgradeV0PaddingLayers(actual_output_param, &double_pad_upgrade_param); - EXPECT_EQ(actual_output_param.DebugString(), - double_pad_upgrade_param.DebugString()); - } -}; - -TEST_F(PaddingLayerUpgradeTest, TestSimple) { - const string& input_proto = - "name: 'CaffeNet' " - "layers { " - " layer { " - " name: 'data' " - " type: 'data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " layer { " - " name: 'pad1' " - " type: 'padding' " - " pad: 2 " - " } " - " bottom: 'data' " - " top: 'pad1' " - "} " - "layers { " - " layer { " - " name: 'conv1' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad1' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'fc8' " - " type: 'innerproduct' " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'conv1' " - " top: 'fc8' " - "} " - "layers { " - " layer { " - " name: 'loss' " - " type: 'softmax_loss' " - " } " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - const string& expected_output_proto = - "name: 'CaffeNet' " - "layers { " - " layer { " - " name: 'data' " - " type: 'data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " layer { " - " name: 'conv1' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " pad: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'fc8' " - " type: 'innerproduct' " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " blobs_lr: 1. 
" - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'conv1' " - " top: 'fc8' " - "} " - "layers { " - " layer { " - " name: 'loss' " - " type: 'softmax_loss' " - " } " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - this->RunPaddingUpgradeTest(input_proto, expected_output_proto); -} - -TEST_F(PaddingLayerUpgradeTest, TestTwoTops) { - const string& input_proto = - "name: 'CaffeNet' " - "layers { " - " layer { " - " name: 'data' " - " type: 'data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " layer { " - " name: 'pad1' " - " type: 'padding' " - " pad: 2 " - " } " - " bottom: 'data' " - " top: 'pad1' " - "} " - "layers { " - " layer { " - " name: 'conv1' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad1' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'fc8' " - " type: 'innerproduct' " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'conv1' " - " top: 'fc8' " - "} " - "layers { " - " layer { " - " name: 'conv2' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad1' " - " top: 'conv2' " - "} " - "layers { " - " layer { " - " name: 'loss' " - " type: 'softmax_loss' " - " } " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - const string& expected_output_proto = - "name: 'CaffeNet' " - "layers { " - " layer { " - " name: 'data' " - " type: 'data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " layer { " - " name: 'conv1' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " pad: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'fc8' " - " type: 'innerproduct' " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " } " - " bottom: 'conv1' " - " top: 'fc8' " - "} " - "layers { " - " layer { " - " name: 'conv2' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " pad: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'data' " - " top: 'conv2' " - "} " - "layers { " - " layer { " - " name: 'loss' " - " type: 'softmax_loss' " - " } " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - this->RunPaddingUpgradeTest(input_proto, expected_output_proto); -} - -TEST_F(PaddingLayerUpgradeTest, TestImageNet) { - const string& input_proto = - "name: 'CaffeNet' " - "layers { " - " layer { " - " name: 'data' " - " type: 'data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " layer { " - " name: 'conv1' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'relu1' " - " type: 'relu' " - " } " - " bottom: 'conv1' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'pool1' " - " type: 'pool' " - " pool: MAX " - " kernelsize: 3 " - " stride: 2 " - " } " - " bottom: 'conv1' " - " top: 'pool1' " - "} " - "layers { " - " layer { " - " name: 'norm1' " - " type: 'lrn' " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool1' " - " top: 'norm1' " - "} " - "layers { " - " layer { " - " name: 'pad2' " - " type: 'padding' " - " pad: 2 " - " } " - " bottom: 'norm1' " - " top: 'pad2' " - "} " - "layers { " - " layer { " - " name: 'conv2' " - " type: 'conv' " - " num_output: 256 " - " group: 2 " - " kernelsize: 5 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad2' " - " top: 'conv2' " - "} " - "layers { " - " layer { " - " name: 'relu2' " - " type: 'relu' " - " } " - " bottom: 'conv2' " - " top: 'conv2' " - "} " - "layers { " - " layer { " - " name: 'pool2' " - " type: 'pool' " - " pool: MAX " - " kernelsize: 3 " - " stride: 2 " - " } " - " bottom: 'conv2' " - " top: 'pool2' " - "} " - "layers { " - " layer { " - " name: 'norm2' " - " type: 'lrn' " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool2' " - " top: 'norm2' " - "} " - "layers { " - " layer { " - " name: 'pad3' " - " type: 'padding' " - " pad: 1 " - " } " - " bottom: 'norm2' " - " top: 'pad3' " - "} " - "layers { " - " layer { " - " name: 'conv3' " - " type: 'conv' " - " num_output: 384 " - " kernelsize: 3 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " } " - " bottom: 'pad3' " - " top: 'conv3' " - "} " - "layers { " - " layer { " - " name: 'relu3' " - " type: 'relu' " - " } " - " bottom: 'conv3' " - " top: 'conv3' " - "} " - "layers { " - " layer { " - " name: 'pad4' " - " type: 'padding' " - " pad: 1 " - " } " - " bottom: 'conv3' " - " top: 'pad4' " - "} " - "layers { " - " layer { " - " name: 'conv4' " - " type: 'conv' " - " num_output: 384 " - " group: 2 " - " kernelsize: 3 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad4' " - " top: 'conv4' " - "} " - "layers { " - " layer { " - " name: 'relu4' " - " type: 'relu' " - " } " - " bottom: 'conv4' " - " top: 'conv4' " - "} " - "layers { " - " layer { " - " name: 'pad5' " - " type: 'padding' " - " pad: 1 " - " } " - " bottom: 'conv4' " - " top: 'pad5' " - "} " - "layers { " - " layer { " - " name: 'conv5' " - " type: 'conv' " - " num_output: 256 " - " group: 2 " - " kernelsize: 3 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad5' " - " top: 'conv5' " - "} " - "layers { " - " layer { " - " name: 'relu5' " - " type: 'relu' " - " } " - " bottom: 'conv5' " - " top: 'conv5' " - "} " - "layers { " - " layer { " - " name: 'pool5' " - " type: 'pool' " - " kernelsize: 3 " - " pool: MAX " - " stride: 2 " - " } " - " bottom: 'conv5' " - " top: 'pool5' " - "} " - "layers { " - " layer { " - " name: 'fc6' " - " type: 'innerproduct' " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pool5' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'relu6' " - " type: 'relu' " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'drop6' " - " type: 'dropout' " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'fc7' " - " type: 'innerproduct' " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'fc6' " - " top: 'fc7' " - "} " - "layers { " - " layer { " - " name: 'relu7' " - " type: 'relu' " - " } " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layers { " - " layer { " - " name: 'drop7' " - " type: 'dropout' " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layers { " - " layer { " - " name: 'fc8' " - " type: 'innerproduct' " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " } " - " bottom: 'fc7' " - " top: 'fc8' " - "} " - "layers { " - " layer { " - " name: 'loss' " - " type: 'softmax_loss' " - " } " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - const string& expected_output_proto = - "name: 'CaffeNet' " - "layers { " - " layer { " - " name: 'data' " - " type: 'data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " layer { " - " name: 'conv1' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'relu1' " - " type: 'relu' " - " } " - " bottom: 'conv1' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'pool1' " - " type: 'pool' " - " pool: MAX " - " kernelsize: 3 " - " stride: 2 " - " } " - " bottom: 'conv1' " - " top: 'pool1' " - "} " - "layers { " - " layer { " - " name: 'norm1' " - " type: 'lrn' " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool1' " - " top: 'norm1' " - "} " - "layers { " - " layer { " - " name: 'conv2' " - " type: 'conv' " - " num_output: 256 " - " group: 2 " - " kernelsize: 5 " - " pad: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'norm1' " - " top: 'conv2' " - "} " - "layers { " - " layer { " - " name: 'relu2' " - " type: 'relu' " - " } " - " bottom: 'conv2' " - " top: 'conv2' " - "} " - "layers { " - " layer { " - " name: 'pool2' " - " type: 'pool' " - " pool: MAX " - " kernelsize: 3 " - " stride: 2 " - " } " - " bottom: 'conv2' " - " top: 'pool2' " - "} " - "layers { " - " layer { " - " name: 'norm2' " - " type: 'lrn' " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool2' " - " top: 'norm2' " - "} " - "layers { " - " layer { " - " name: 'conv3' " - " type: 'conv' " - " num_output: 384 " - " kernelsize: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'norm2' " - " top: 'conv3' " - "} " - "layers { " - " layer { " - " name: 'relu3' " - " type: 'relu' " - " } " - " bottom: 'conv3' " - " top: 'conv3' " - "} " - "layers { " - " layer { " - " name: 'conv4' " - " type: 'conv' " - " num_output: 384 " - " group: 2 " - " kernelsize: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " } " - " bottom: 'conv3' " - " top: 'conv4' " - "} " - "layers { " - " layer { " - " name: 'relu4' " - " type: 'relu' " - " } " - " bottom: 'conv4' " - " top: 'conv4' " - "} " - "layers { " - " layer { " - " name: 'conv5' " - " type: 'conv' " - " num_output: 256 " - " group: 2 " - " kernelsize: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'conv4' " - " top: 'conv5' " - "} " - "layers { " - " layer { " - " name: 'relu5' " - " type: 'relu' " - " } " - " bottom: 'conv5' " - " top: 'conv5' " - "} " - "layers { " - " layer { " - " name: 'pool5' " - " type: 'pool' " - " kernelsize: 3 " - " pool: MAX " - " stride: 2 " - " } " - " bottom: 'conv5' " - " top: 'pool5' " - "} " - "layers { " - " layer { " - " name: 'fc6' " - " type: 'innerproduct' " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pool5' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'relu6' " - " type: 'relu' " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'drop6' " - " type: 'dropout' " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'fc7' " - " type: 'innerproduct' " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'fc6' " - " top: 'fc7' " - "} " - "layers { " - " layer { " - " name: 'relu7' " - " type: 'relu' " - " } " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layers { " - " layer { " - " name: 'drop7' " - " type: 'dropout' " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layers { " - " layer { " - " name: 'fc8' " - " type: 'innerproduct' " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'fc7' " - " top: 'fc8' " - "} " - "layers { " - " layer { " - " name: 'loss' " - " type: 'softmax_loss' " - " } " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - this->RunPaddingUpgradeTest(input_proto, expected_output_proto); -} - -class NetUpgradeTest : public ::testing::Test { - protected: - void RunV0UpgradeTest( - const string& input_param_string, const string& output_param_string) { - // Test that UpgradeV0Net called on the NetParameter proto specified by - // input_param_string results in the NetParameter proto specified by - // output_param_string. 
- NetParameter input_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - input_param_string, &input_param)); - NetParameter expected_output_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - output_param_string, &expected_output_param)); - NetParameter actual_output_param; - UpgradeV0Net(input_param, &actual_output_param); - EXPECT_EQ(expected_output_param.DebugString(), - actual_output_param.DebugString()); - } - - void RunV1UpgradeTest( - const string& input_param_string, const string& output_param_string) { - // Test that UpgradeV0Net called on the NetParameter proto specified by - // input_param_string results in the NetParameter proto specified by - // output_param_string. - NetParameter input_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - input_param_string, &input_param)); - NetParameter expected_output_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - output_param_string, &expected_output_param)); - NetParameter actual_output_param; - UpgradeV1Net(input_param, &actual_output_param); - EXPECT_EQ(expected_output_param.DebugString(), - actual_output_param.DebugString()); - } -}; - -TEST_F(NetUpgradeTest, TestSimple) { - const string& v0_proto = - "name: 'CaffeNet' " - "layers { " - " layer { " - " name: 'data' " - " type: 'data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " layer { " - " name: 'pad1' " - " type: 'padding' " - " pad: 2 " - " } " - " bottom: 'data' " - " top: 'pad1' " - "} " - "layers { " - " layer { " - " name: 'conv1' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad1' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'fc8' " - " type: 'innerproduct' " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'conv1' " - " top: 'fc8' " - "} " - "layers { " - " layer { " - " name: 'loss' " - " type: 'softmax_loss' " - " } " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - const string& expected_v1_proto = - "name: 'CaffeNet' " - "layers { " - " name: 'data' " - " type: DATA " - " data_param { " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " batch_size: 256 " - " } " - " transform_param { " - " crop_size: 227 " - " mirror: true " - " mean_file: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " name: 'conv1' " - " type: CONVOLUTION " - " convolution_param { " - " num_output: 96 " - " kernel_size: 11 " - " stride: 4 " - " pad: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " bottom: 'data' " - " top: 'conv1' " - "} " - "layers { " - " name: 'fc8' " - " type: INNER_PRODUCT " - " inner_product_param { " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " bottom: 'conv1' " - " top: 'fc8' " - "} " - "layers { " - " name: 'loss' " - " type: SOFTMAX_LOSS " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - this->RunV0UpgradeTest(v0_proto, expected_v1_proto); - - const string& expected_v2_proto = - "name: 'CaffeNet' " - "layer { " - " name: 'data' " - " type: 'Data' " - " data_param { " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " batch_size: 256 " - " } " - " transform_param { " - " crop_size: 227 " - " mirror: true " - " mean_file: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'conv1' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 96 " - " kernel_size: 11 " - " stride: 4 " - " pad: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layer { " - " name: 'fc8' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'conv1' " - " top: 'fc8' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - this->RunV1UpgradeTest(expected_v1_proto, expected_v2_proto); -} - -// Test any layer or parameter upgrades not covered by other tests. 
-TEST_F(NetUpgradeTest, TestAllParams) { - const string& input_proto = - "name: 'CaffeNet' " - "input: 'input_data' " - "input_dim: 64 " - "input_dim: 3 " - "input_dim: 32 " - "input_dim: 32 " - "layers { " - " layer { " - " name: 'data' " - " type: 'data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " scale: 0.25 " - " rand_skip: 73 " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " layer { " - " name: 'images' " - " type: 'images' " - " source: '/home/jiayq/Data/ILSVRC12/train-images' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " scale: 0.25 " - " rand_skip: 73 " - " shuffle_images: true " - " new_height: 40 " - " new_width: 30 " - " } " - " top: 'images_data' " - " top: 'images_label' " - "} " - "layers { " - " layer { " - " name: 'window_data' " - " type: 'window_data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " det_fg_threshold: 0.25 " - " det_bg_threshold: 0.75 " - " det_fg_fraction: 0.5 " - " det_context_pad: 16 " - " det_crop_mode: 'square' " - " } " - " top: 'window_data' " - " top: 'window_label' " - "} " - "layers { " - " layer { " - " name: 'hdf5data' " - " type: 'hdf5_data' " - " source: '/my/hdf5/data' " - " batchsize: 256 " - " } " - " top: 'hdf5data' " - "} " - "layers { " - " layer { " - " name: 'conv1' " - " type: 'conv' " - " num_output: 96 " - " biasterm: false " - " pad: 4 " - " kernelsize: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 3. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'pool1ave' " - " type: 'pool' " - " pool: AVE " - " kernelsize: 3 " - " stride: 2 " - " } " - " bottom: 'conv1' " - " top: 'pool1ave' " - "} " - "layers { " - " layer { " - " name: 'pool1stoch' " - " type: 'pool' " - " pool: STOCHASTIC " - " kernelsize: 4 " - " stride: 5 " - " } " - " bottom: 'conv1' " - " top: 'pool1stoch' " - "} " - "layers { " - " layer { " - " name: 'concat' " - " type: 'concat' " - " concat_dim: 2 " - " } " - " bottom: 'pool1ave' " - " bottom: 'pool1stoch' " - " top: 'pool1concat' " - "} " - "layers { " - " layer { " - " name: 'norm1' " - " type: 'lrn' " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool1concat' " - " top: 'norm1' " - "} " - "layers { " - " layer { " - " name: 'fc6' " - " type: 'innerproduct' " - " num_output: 4096 " - " biasterm: false " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " } " - " bottom: 'norm1' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'relu6' " - " type: 'relu' " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'drop6' " - " type: 'dropout' " - " dropout_ratio: 0.2 " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'loss' " - " type: 'infogain_loss' " - " source: '/my/infogain/matrix' " - " } " - " bottom: 'fc6' " - " bottom: 'label' " - "} " - "layers { " - " layer { " - " name: 'accuracy' " - " type: 'accuracy' " - " } " - "} " - "layers { " - " layer { " - " name: 'bnll' " - " type: 'bnll' " - " } " - "} " - "layers { " - " layer { " - " name: 'euclidean_loss' " - " type: 'euclidean_loss' " - " } " - "} " - "layers { " - " layer { " - " name: 'flatten' " - " type: 'flatten' " - " } " - "} " - "layers { " - " layer { " - " name: 'hdf5_output' " - " type: 'hdf5_output' " - " hdf5_output_param { " - " file_name: '/my/hdf5/output/file' " - " } " - " } " - "} " - "layers { " - " layer { " - " name: 'im2col' " - " type: 'im2col' " - " } " - "} " - "layers { " - " layer { " - " name: 'images' " - " type: 'images' " - " } " - "} " - "layers { " - " layer { " - " name: 'multinomial_logistic_loss' " - " type: 'multinomial_logistic_loss' " - " } " - "} " - "layers { " - " layer { " - " name: 'sigmoid' " - " type: 'sigmoid' " - " } " - "} " - "layers { " - " layer { " - " name: 'softmax' " - " type: 'softmax' " - " } " - "} " - "layers { " - " layer { " - " name: 'split' " - " type: 'split' " - " } " - "} " - "layers { " - " layer { " - " name: 'tanh' " - " type: 'tanh' " - " } " - "} "; - const string& expected_output_proto = - "name: 'CaffeNet' " - "input: 'input_data' " - "input_dim: 64 " - "input_dim: 3 " - "input_dim: 32 " - "input_dim: 32 " - "layers { " - " name: 'data' " - " type: DATA " - " data_param { " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " batch_size: 256 " - " rand_skip: 73 " - " } " - " transform_param { " - " crop_size: 227 " - " mirror: true " - " scale: 0.25 " - " mean_file: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " name: 'images' " - " type: IMAGE_DATA " - " image_data_param { " - " source: '/home/jiayq/Data/ILSVRC12/train-images' " - " batch_size: 256 " - " rand_skip: 73 " - " shuffle: true " - " new_height: 40 " - " new_width: 30 " - " } " - " transform_param {" - " mean_file: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " crop_size: 227 " - " mirror: true " - " scale: 0.25 " - " } " - " top: 'images_data' " - " top: 'images_label' " - "} " - "layers { " - " name: 'window_data' " - " type: WINDOW_DATA " - " window_data_param { " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " batch_size: 256 " - " fg_threshold: 0.25 " - " bg_threshold: 0.75 " - " fg_fraction: 0.5 " - " context_pad: 16 " - " crop_mode: 'square' " - " } " - " transform_param { " - " mirror: true " - " crop_size: 227 " - " mean_file: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " }" - " top: 'window_data' " - " top: 'window_label' " - "} " - "layers { " - " name: 'hdf5data' " - " type: HDF5_DATA " - " hdf5_data_param { " - " source: '/my/hdf5/data' " - " batch_size: 256 " - " } " - " top: 'hdf5data' " - "} " - "layers { " - " name: 'conv1' " - " type: CONVOLUTION " - " convolution_param { " - " num_output: 96 " - " bias_term: false " - " pad: 4 " - " kernel_size: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 
0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 3. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layers { " - " name: 'pool1ave' " - " type: POOLING " - " pooling_param { " - " pool: AVE " - " kernel_size: 3 " - " stride: 2 " - " } " - " bottom: 'conv1' " - " top: 'pool1ave' " - "} " - "layers { " - " name: 'pool1stoch' " - " type: POOLING " - " pooling_param { " - " pool: STOCHASTIC " - " kernel_size: 4 " - " stride: 5 " - " } " - " bottom: 'conv1' " - " top: 'pool1stoch' " - "} " - "layers { " - " name: 'concat' " - " type: CONCAT " - " concat_param { " - " concat_dim: 2 " - " } " - " bottom: 'pool1ave' " - " bottom: 'pool1stoch' " - " top: 'pool1concat' " - "} " - "layers { " - " name: 'norm1' " - " type: LRN " - " lrn_param { " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool1concat' " - " top: 'norm1' " - "} " - "layers { " - " name: 'fc6' " - " type: INNER_PRODUCT " - " inner_product_param { " - " num_output: 4096 " - " bias_term: false " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " bottom: 'norm1' " - " top: 'fc6' " - "} " - "layers { " - " name: 'relu6' " - " type: RELU " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " name: 'drop6' " - " type: DROPOUT " - " dropout_param { " - " dropout_ratio: 0.2 " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " name: 'loss' " - " type: INFOGAIN_LOSS " - " infogain_loss_param { " - " source: '/my/infogain/matrix' " - " } " - " bottom: 'fc6' " - " bottom: 'label' " - "} " - "layers { " - " name: 'accuracy' " - " type: ACCURACY " - "} " - "layers { " - " name: 'bnll' " - " type: BNLL " - "} " - "layers { " - " name: 'euclidean_loss' " - " type: EUCLIDEAN_LOSS " - "} " - "layers { " - " name: 'flatten' " - " type: FLATTEN " - "} " - "layers { " - " name: 'hdf5_output' " - " type: HDF5_OUTPUT " - " hdf5_output_param { " - " file_name: '/my/hdf5/output/file' " - " } " - "} " - "layers { " - " name: 'im2col' " - " type: IM2COL " - "} " - "layers { " - " name: 'images' " - " type: IMAGE_DATA " - "} " - "layers { " - " name: 'multinomial_logistic_loss' " - " type: MULTINOMIAL_LOGISTIC_LOSS " - "} " - "layers { " - " name: 'sigmoid' " - " type: SIGMOID " - "} " - "layers { " - " name: 'softmax' " - " type: SOFTMAX " - "} " - "layers { " - " name: 'split' " - " type: SPLIT " - "} " - "layers { " - " name: 'tanh' " - " type: TANH " - "} "; - this->RunV0UpgradeTest(input_proto, expected_output_proto); -} - -TEST_F(NetUpgradeTest, TestImageNet) { - const string& v0_proto = - "name: 'CaffeNet' " - "layers { " - " layer { " - " name: 'data' " - " type: 'data' " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " meanfile: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " batchsize: 256 " - " cropsize: 227 " - " mirror: true " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " layer { " - " name: 'conv1' " - " type: 'conv' " - " num_output: 96 " - " kernelsize: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " } " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'relu1' " - " type: 'relu' " - " } " - " bottom: 'conv1' " - " top: 'conv1' " - "} " - "layers { " - " layer { " - " name: 'pool1' " - " type: 'pool' " - " pool: MAX " - " kernelsize: 3 " - " stride: 2 " - " } " - " bottom: 'conv1' " - " top: 'pool1' " - "} " - "layers { " - " layer { " - " name: 'norm1' " - " type: 'lrn' " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool1' " - " top: 'norm1' " - "} " - "layers { " - " layer { " - " name: 'pad2' " - " type: 'padding' " - " pad: 2 " - " } " - " bottom: 'norm1' " - " top: 'pad2' " - "} " - "layers { " - " layer { " - " name: 'conv2' " - " type: 'conv' " - " num_output: 256 " - " group: 2 " - " kernelsize: 5 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad2' " - " top: 'conv2' " - "} " - "layers { " - " layer { " - " name: 'relu2' " - " type: 'relu' " - " } " - " bottom: 'conv2' " - " top: 'conv2' " - "} " - "layers { " - " layer { " - " name: 'pool2' " - " type: 'pool' " - " pool: MAX " - " kernelsize: 3 " - " stride: 2 " - " } " - " bottom: 'conv2' " - " top: 'pool2' " - "} " - "layers { " - " layer { " - " name: 'norm2' " - " type: 'lrn' " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool2' " - " top: 'norm2' " - "} " - "layers { " - " layer { " - " name: 'pad3' " - " type: 'padding' " - " pad: 1 " - " } " - " bottom: 'norm2' " - " top: 'pad3' " - "} " - "layers { " - " layer { " - " name: 'conv3' " - " type: 'conv' " - " num_output: 384 " - " kernelsize: 3 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad3' " - " top: 'conv3' " - "} " - "layers { " - " layer { " - " name: 'relu3' " - " type: 'relu' " - " } " - " bottom: 'conv3' " - " top: 'conv3' " - "} " - "layers { " - " layer { " - " name: 'pad4' " - " type: 'padding' " - " pad: 1 " - " } " - " bottom: 'conv3' " - " top: 'pad4' " - "} " - "layers { " - " layer { " - " name: 'conv4' " - " type: 'conv' " - " num_output: 384 " - " group: 2 " - " kernelsize: 3 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pad4' " - " top: 'conv4' " - "} " - "layers { " - " layer { " - " name: 'relu4' " - " type: 'relu' " - " } " - " bottom: 'conv4' " - " top: 'conv4' " - "} " - "layers { " - " layer { " - " name: 'pad5' " - " type: 'padding' " - " pad: 1 " - " } " - " bottom: 'conv4' " - " top: 'pad5' " - "} " - "layers { " - " layer { " - " name: 'conv5' " - " type: 'conv' " - " num_output: 256 " - " group: 2 " - " kernelsize: 3 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " } " - " bottom: 'pad5' " - " top: 'conv5' " - "} " - "layers { " - " layer { " - " name: 'relu5' " - " type: 'relu' " - " } " - " bottom: 'conv5' " - " top: 'conv5' " - "} " - "layers { " - " layer { " - " name: 'pool5' " - " type: 'pool' " - " kernelsize: 3 " - " pool: MAX " - " stride: 2 " - " } " - " bottom: 'conv5' " - " top: 'pool5' " - "} " - "layers { " - " layer { " - " name: 'fc6' " - " type: 'innerproduct' " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'pool5' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'relu6' " - " type: 'relu' " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'drop6' " - " type: 'dropout' " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " layer { " - " name: 'fc7' " - " type: 'innerproduct' " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'fc6' " - " top: 'fc7' " - "} " - "layers { " - " layer { " - " name: 'relu7' " - " type: 'relu' " - " } " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layers { " - " layer { " - " name: 'drop7' " - " type: 'dropout' " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layers { " - " layer { " - " name: 'fc8' " - " type: 'innerproduct' " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " } " - " bottom: 'fc7' " - " top: 'fc8' " - "} " - "layers { " - " layer { " - " name: 'loss' " - " type: 'softmax_loss' " - " } " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - const string& expected_v1_proto = - "name: 'CaffeNet' " - "layers { " - " name: 'data' " - " type: DATA " - " data_param { " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " batch_size: 256 " - " } " - " transform_param { " - " crop_size: 227 " - " mirror: true " - " mean_file: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layers { " - " name: 'conv1' " - " type: CONVOLUTION " - " convolution_param { " - " num_output: 96 " - " kernel_size: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " bottom: 'data' " - " top: 'conv1' " - "} " - "layers { " - " name: 'relu1' " - " type: RELU " - " bottom: 'conv1' " - " top: 'conv1' " - "} " - "layers { " - " name: 'pool1' " - " type: POOLING " - " pooling_param { " - " pool: MAX " - " kernel_size: 3 " - " stride: 2 " - " } " - " bottom: 'conv1' " - " top: 'pool1' " - "} " - "layers { " - " name: 'norm1' " - " type: LRN " - " lrn_param { " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool1' " - " top: 'norm1' " - "} " - "layers { " - " name: 'conv2' " - " type: CONVOLUTION " - " convolution_param { " - " num_output: 256 " - " group: 2 " - " kernel_size: 5 " - " pad: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " bottom: 'norm1' " - " top: 'conv2' " - "} " - "layers { " - " name: 'relu2' " - " type: RELU " - " bottom: 'conv2' " - " top: 'conv2' " - "} " - "layers { " - " name: 'pool2' " - " type: POOLING " - " pooling_param { " - " pool: MAX " - " kernel_size: 3 " - " stride: 2 " - " } " - " bottom: 'conv2' " - " top: 'pool2' " - "} " - "layers { " - " name: 'norm2' " - " type: LRN " - " lrn_param { " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool2' " - " top: 'norm2' " - "} " - "layers { " - " name: 'conv3' " - " type: CONVOLUTION " - " convolution_param { " - " num_output: 384 " - " kernel_size: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " bottom: 'norm2' " - " top: 'conv3' " - "} " - "layers { " - " name: 'relu3' " - " type: RELU " - " bottom: 'conv3' " - " top: 'conv3' " - "} " - "layers { " - " name: 'conv4' " - " type: CONVOLUTION " - " convolution_param { " - " num_output: 384 " - " group: 2 " - " kernel_size: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " bottom: 'conv3' " - " top: 'conv4' " - "} " - "layers { " - " name: 'relu4' " - " type: RELU " - " bottom: 'conv4' " - " top: 'conv4' " - "} " - "layers { " - " name: 'conv5' " - " type: CONVOLUTION " - " convolution_param { " - " num_output: 256 " - " group: 2 " - " kernel_size: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " bottom: 'conv4' " - " top: 'conv5' " - "} " - "layers { " - " name: 'relu5' " - " type: RELU " - " bottom: 'conv5' " - " top: 'conv5' " - "} " - "layers { " - " name: 'pool5' " - " type: POOLING " - " pooling_param { " - " kernel_size: 3 " - " pool: MAX " - " stride: 2 " - " } " - " bottom: 'conv5' " - " top: 'pool5' " - "} " - "layers { " - " name: 'fc6' " - " type: INNER_PRODUCT " - " inner_product_param { " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. 
" - " bottom: 'pool5' " - " top: 'fc6' " - "} " - "layers { " - " name: 'relu6' " - " type: RELU " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " name: 'drop6' " - " type: DROPOUT " - " dropout_param { " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layers { " - " name: 'fc7' " - " type: INNER_PRODUCT " - " inner_product_param { " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " bottom: 'fc6' " - " top: 'fc7' " - "} " - "layers { " - " name: 'relu7' " - " type: RELU " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layers { " - " name: 'drop7' " - " type: DROPOUT " - " dropout_param { " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layers { " - " name: 'fc8' " - " type: INNER_PRODUCT " - " inner_product_param { " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " blobs_lr: 1. " - " blobs_lr: 2. " - " weight_decay: 1. " - " weight_decay: 0. " - " bottom: 'fc7' " - " top: 'fc8' " - "} " - "layers { " - " name: 'loss' " - " type: SOFTMAX_LOSS " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - this->RunV0UpgradeTest(v0_proto, expected_v1_proto); - - const string& expected_v2_proto = - "name: 'CaffeNet' " - "layer { " - " name: 'data' " - " type: 'Data' " - " data_param { " - " source: '/home/jiayq/Data/ILSVRC12/train-leveldb' " - " batch_size: 256 " - " } " - " transform_param { " - " crop_size: 227 " - " mirror: true " - " mean_file: '/home/jiayq/Data/ILSVRC12/image_mean.binaryproto' " - " } " - " top: 'data' " - " top: 'label' " - "} " - "layer { " - " name: 'conv1' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 96 " - " kernel_size: 11 " - " stride: 4 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'data' " - " top: 'conv1' " - "} " - "layer { " - " name: 'relu1' " - " type: 'ReLU' " - " bottom: 'conv1' " - " top: 'conv1' " - "} " - "layer { " - " name: 'pool1' " - " type: 'Pooling' " - " pooling_param { " - " pool: MAX " - " kernel_size: 3 " - " stride: 2 " - " } " - " bottom: 'conv1' " - " top: 'pool1' " - "} " - "layer { " - " name: 'norm1' " - " type: 'LRN' " - " lrn_param { " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool1' " - " top: 'norm1' " - "} " - "layer { " - " name: 'conv2' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 256 " - " group: 2 " - " kernel_size: 5 " - " pad: 2 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. 
" - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'norm1' " - " top: 'conv2' " - "} " - "layer { " - " name: 'relu2' " - " type: 'ReLU' " - " bottom: 'conv2' " - " top: 'conv2' " - "} " - "layer { " - " name: 'pool2' " - " type: 'Pooling' " - " pooling_param { " - " pool: MAX " - " kernel_size: 3 " - " stride: 2 " - " } " - " bottom: 'conv2' " - " top: 'pool2' " - "} " - "layer { " - " name: 'norm2' " - " type: 'LRN' " - " lrn_param { " - " local_size: 5 " - " alpha: 0.0001 " - " beta: 0.75 " - " } " - " bottom: 'pool2' " - " top: 'norm2' " - "} " - "layer { " - " name: 'conv3' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 384 " - " kernel_size: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'norm2' " - " top: 'conv3' " - "} " - "layer { " - " name: 'relu3' " - " type: 'ReLU' " - " bottom: 'conv3' " - " top: 'conv3' " - "} " - "layer { " - " name: 'conv4' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 384 " - " group: 2 " - " kernel_size: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'conv3' " - " top: 'conv4' " - "} " - "layer { " - " name: 'relu4' " - " type: 'ReLU' " - " bottom: 'conv4' " - " top: 'conv4' " - "} " - "layer { " - " name: 'conv5' " - " type: 'Convolution' " - " convolution_param { " - " num_output: 256 " - " group: 2 " - " kernel_size: 3 " - " pad: 1 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'conv4' " - " top: 'conv5' " - "} " - "layer { " - " name: 'relu5' " - " type: 'ReLU' " - " bottom: 'conv5' " - " top: 'conv5' " - "} " - "layer { " - " name: 'pool5' " - " type: 'Pooling' " - " pooling_param { " - " kernel_size: 3 " - " pool: MAX " - " stride: 2 " - " } " - " bottom: 'conv5' " - " top: 'pool5' " - "} " - "layer { " - " name: 'fc6' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'pool5' " - " top: 'fc6' " - "} " - "layer { " - " name: 'relu6' " - " type: 'ReLU' " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layer { " - " name: 'drop6' " - " type: 'Dropout' " - " dropout_param { " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc6' " - " top: 'fc6' " - "} " - "layer { " - " name: 'fc7' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 4096 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.005 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 1. 
" - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'fc6' " - " top: 'fc7' " - "} " - "layer { " - " name: 'relu7' " - " type: 'ReLU' " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layer { " - " name: 'drop7' " - " type: 'Dropout' " - " dropout_param { " - " dropout_ratio: 0.5 " - " } " - " bottom: 'fc7' " - " top: 'fc7' " - "} " - "layer { " - " name: 'fc8' " - " type: 'InnerProduct' " - " inner_product_param { " - " num_output: 1000 " - " weight_filler { " - " type: 'gaussian' " - " std: 0.01 " - " } " - " bias_filler { " - " type: 'constant' " - " value: 0 " - " } " - " } " - " param { " - " lr_mult: 1 " - " decay_mult: 1 " - " } " - " param { " - " lr_mult: 2 " - " decay_mult: 0 " - " } " - " bottom: 'fc7' " - " top: 'fc8' " - "} " - "layer { " - " name: 'loss' " - " type: 'SoftmaxWithLoss' " - " bottom: 'fc8' " - " bottom: 'label' " - "} "; - this->RunV1UpgradeTest(expected_v1_proto, expected_v2_proto); -} // NOLINT(readability/fn_size) - -TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) { - LayerParameter layer_param; - shared_ptr > layer; - for (int i = 0; i < V1LayerParameter_LayerType_LayerType_ARRAYSIZE; ++i) { - ASSERT_TRUE(V1LayerParameter_LayerType_IsValid(i)); - V1LayerParameter_LayerType v1_type = V1LayerParameter_LayerType(i); - string v2_layer_type(UpgradeV1LayerType(v1_type)); - if (v2_layer_type == "") { - EXPECT_EQ(V1LayerParameter_LayerType_NONE, v1_type); - continue; // Empty string isn't actually a valid layer type. - } - layer_param.set_type(v2_layer_type); - // Data layers expect a DB - if (v2_layer_type == "Data") { - #ifdef USE_LEVELDB - string tmp; - MakeTempDir(&tmp); - boost::scoped_ptr db(db::GetDB(DataParameter_DB_LEVELDB)); - db->Open(tmp, db::NEW); - db->Close(); - layer_param.mutable_data_param()->set_source(tmp); - #else - continue; - #endif // USE_LEVELDB - } - #ifndef USE_OPENCV - if (v2_layer_type == "ImageData" || v2_layer_type == "WindowData") { - continue; - } - #endif // !USE_OPENCV - layer = LayerRegistry::CreateLayer(layer_param); - EXPECT_EQ(v2_layer_type, layer->type()); - } -} - -class SolverTypeUpgradeTest : public ::testing::Test { - protected: - void RunSolverTypeUpgradeTest( - const string& input_param_string, const string& output_param_string) { - // Test upgrading old solver_type field (enum) to new type field (string) - SolverParameter input_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - input_param_string, &input_param)); - SolverParameter expected_output_param; - CHECK(google::protobuf::TextFormat::ParseFromString( - output_param_string, &expected_output_param)); - SolverParameter actual_output_param = input_param; - UpgradeSolverType(&actual_output_param); - EXPECT_EQ(expected_output_param.DebugString(), - actual_output_param.DebugString()); - } -}; - -TEST_F(SolverTypeUpgradeTest, TestSimple) { - const char* old_type_vec[6] = { "SGD", "ADAGRAD", "NESTEROV", "RMSPROP", - "ADADELTA", "ADAM" }; - const char* new_type_vec[6] = { "SGD", "AdaGrad", "Nesterov", "RMSProp", - "AdaDelta", "Adam" }; - for (int i = 0; i < 6; ++i) { - const string& input_proto = - "net: 'examples/mnist/lenet_train_test.prototxt' " - "test_iter: 100 " - "test_interval: 500 " - "base_lr: 0.01 " - "momentum: 0.0 " - "weight_decay: 0.0005 " - "lr_policy: 'inv' " - "gamma: 0.0001 " - "power: 0.75 " - "display: 100 " - "max_iter: 10000 " - "snapshot: 5000 " - "snapshot_prefix: 'examples/mnist/lenet_rmsprop' " - "solver_mode: GPU " - "solver_type: " + 
std::string(old_type_vec[i]) + " "; - const string& expected_output_proto = - "net: 'examples/mnist/lenet_train_test.prototxt' " - "test_iter: 100 " - "test_interval: 500 " - "base_lr: 0.01 " - "momentum: 0.0 " - "weight_decay: 0.0005 " - "lr_policy: 'inv' " - "gamma: 0.0001 " - "power: 0.75 " - "display: 100 " - "max_iter: 10000 " - "snapshot: 5000 " - "snapshot_prefix: 'examples/mnist/lenet_rmsprop' " - "solver_mode: GPU " - "type: '" + std::string(new_type_vec[i]) + "' "; - this->RunSolverTypeUpgradeTest(input_proto, expected_output_proto); - } -} - -} // NOLINT(readability/fn_size) // namespace caffe diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp deleted file mode 100644 index 9ee8818..0000000 --- a/src/caffe/test/test_util_blas.cpp +++ /dev/null @@ -1,132 +0,0 @@ -#ifndef CPU_ONLY // CPU-GPU test - -#include "gtest/gtest.h" - -#include "caffe/blob.hpp" -#include "caffe/util/device_alternate.hpp" -#include "caffe/util/math_functions.hpp" - -#include "caffe/test/test_caffe_main.hpp" - -namespace caffe { - -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; - -template -class GemmTest : public ::testing::Test {}; - -TYPED_TEST_CASE(GemmTest, TestDtypes); - -TYPED_TEST(GemmTest, TestGemmCPUGPU) { - Blob A(1, 1, 2, 3); - Blob B(1, 1, 3, 4); - Blob C(1, 1, 2, 4); - TypeParam data[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - TypeParam A_reshape_data[6] = {1, 4, 2, 5, 3, 6}; - TypeParam B_reshape_data[12] = {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12}; - TypeParam result[8] = {38, 44, 50, 56, 83, 98, 113, 128}; - caffe_copy(6, data, A.mutable_cpu_data()); - caffe_copy(12, data, B.mutable_cpu_data()); - - if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { - // [1, 2, 3; 4 5 6] * [1, 2, 3, 4; 5, 6, 7, 8; 9, 10, 11, 12]; - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1., - A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data()); - for (int i = 0; i < 8; ++i) { - EXPECT_EQ(C.cpu_data()[i], result[i]); - } - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1., - A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); - for (int i = 0; i < 8; ++i) { - EXPECT_EQ(C.cpu_data()[i], result[i]); - } - - // Test when we have a transposed A - A.Reshape(1, 1, 3, 2); - caffe_copy(6, A_reshape_data, A.mutable_cpu_data()); - caffe_cpu_gemm(CblasTrans, CblasNoTrans, 2, 4, 3, 1., - A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data()); - for (int i = 0; i < 8; ++i) { - EXPECT_EQ(C.cpu_data()[i], result[i]); - } - caffe_gpu_gemm(CblasTrans, CblasNoTrans, 2, 4, 3, 1., - A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); - for (int i = 0; i < 8; ++i) { - EXPECT_EQ(C.cpu_data()[i], result[i]); - } - - // Test when we have a transposed A and a transposed B too - B.Reshape(1, 1, 4, 3); - caffe_copy(12, B_reshape_data, B.mutable_cpu_data()); - caffe_cpu_gemm(CblasTrans, CblasTrans, 2, 4, 3, 1., - A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data()); - for (int i = 0; i < 8; ++i) { - EXPECT_EQ(C.cpu_data()[i], result[i]); - } - caffe_gpu_gemm(CblasTrans, CblasTrans, 2, 4, 3, 1., - A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); - for (int i = 0; i < 8; ++i) { - EXPECT_EQ(C.cpu_data()[i], result[i]); - } - - // Test when we have a transposed B - A.Reshape(1, 1, 2, 3); - caffe_copy(6, data, A.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasTrans, 2, 4, 3, 1., - A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data()); - for (int i = 0; i < 8; ++i) { - EXPECT_EQ(C.cpu_data()[i], result[i]); - } - caffe_gpu_gemm(CblasNoTrans, CblasTrans, 2, 4, 3, 1., - 
A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); - for (int i = 0; i < 8; ++i) { - EXPECT_EQ(C.cpu_data()[i], result[i]); - } - } else { - LOG(ERROR) << "Skipping test due to old architecture."; - } -} - - -TYPED_TEST(GemmTest, TestGemvCPUGPU) { - Blob A(1, 1, 2, 3); - Blob x(1, 1, 1, 3); - Blob y(1, 1, 1, 2); - TypeParam data[6] = {1, 2, 3, 4, 5, 6}; - TypeParam result_2[2] = {14, 32}; - TypeParam result_3[3] = {9, 12, 15}; - caffe_copy(6, data, A.mutable_cpu_data()); - caffe_copy(3, data, x.mutable_cpu_data()); - - if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { - caffe_cpu_gemv(CblasNoTrans, 2, 3, 1., A.cpu_data(), - x.cpu_data(), 0., y.mutable_cpu_data()); - for (int i = 0; i < 2; ++i) { - EXPECT_EQ(y.cpu_data()[i], result_2[i]); - } - caffe_gpu_gemv(CblasNoTrans, 2, 3, 1., A.gpu_data(), - x.gpu_data(), 0., y.mutable_gpu_data()); - for (int i = 0; i < 2; ++i) { - EXPECT_EQ(y.cpu_data()[i], result_2[i]); - } - - // Test transpose case - caffe_copy(2, data, y.mutable_cpu_data()); - caffe_cpu_gemv(CblasTrans, 2, 3, 1., A.cpu_data(), - y.cpu_data(), 0., x.mutable_cpu_data()); - for (int i = 0; i < 3; ++i) { - EXPECT_EQ(x.cpu_data()[i], result_3[i]); - } - caffe_gpu_gemv(CblasTrans, 2, 3, 1., A.gpu_data(), - y.gpu_data(), 0., x.mutable_gpu_data()); - for (int i = 0; i < 3; ++i) { - EXPECT_EQ(x.cpu_data()[i], result_3[i]); - } - } else { - LOG(ERROR) << "Skipping test due to old architecture."; - } -} - -} // namespace caffe - -#endif // CPU_ONLY diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp deleted file mode 100644 index 1d269c3..0000000 --- a/src/caffe/util/benchmark.cpp +++ /dev/null @@ -1,168 +0,0 @@ -#include - -#include "caffe/common.hpp" -#include "caffe/util/benchmark.hpp" - -namespace caffe { - -Timer::Timer() - : initted_(false), - running_(false), - has_run_at_least_once_(false) { - Init(); -} - -Timer::~Timer() { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventDestroy(start_gpu_)); - CUDA_CHECK(cudaEventDestroy(stop_gpu_)); -#else - NO_GPU; -#endif - } -} - -void Timer::Start() { - if (!running()) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventRecord(start_gpu_, 0)); -#else - NO_GPU; -#endif - } else { - start_cpu_ = boost::posix_time::microsec_clock::local_time(); - } - running_ = true; - has_run_at_least_once_ = true; - } -} - -void Timer::Stop() { - if (running()) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventRecord(stop_gpu_, 0)); - CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); -#else - NO_GPU; -#endif - } else { - stop_cpu_ = boost::posix_time::microsec_clock::local_time(); - } - running_ = false; - } -} - - -float Timer::MicroSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, - stop_gpu_)); - // Cuda only measure milliseconds - elapsed_microseconds_ = elapsed_milliseconds_ * 1000; -#else - NO_GPU; -#endif - } else { - elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); - } - return elapsed_microseconds_; -} - -float Timer::MilliSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - 
CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, - stop_gpu_)); -#else - NO_GPU; -#endif - } else { - elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); - } - return elapsed_milliseconds_; -} - -float Timer::Seconds() { - return MilliSeconds() / 1000.; -} - -void Timer::Init() { - if (!initted()) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventCreate(&start_gpu_)); - CUDA_CHECK(cudaEventCreate(&stop_gpu_)); -#else - NO_GPU; -#endif - } - initted_ = true; - } -} - -CPUTimer::CPUTimer() { - this->initted_ = true; - this->running_ = false; - this->has_run_at_least_once_ = false; -} - -void CPUTimer::Start() { - if (!running()) { - this->start_cpu_ = boost::posix_time::microsec_clock::local_time(); - this->running_ = true; - this->has_run_at_least_once_ = true; - } -} - -void CPUTimer::Stop() { - if (running()) { - this->stop_cpu_ = boost::posix_time::microsec_clock::local_time(); - this->running_ = false; - } -} - -float CPUTimer::MilliSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - this->elapsed_milliseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_milliseconds(); - return this->elapsed_milliseconds_; -} - -float CPUTimer::MicroSeconds() { - if (!has_run_at_least_once()) { - LOG(WARNING) << "Timer has never been run before reading time."; - return 0; - } - if (running()) { - Stop(); - } - this->elapsed_microseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_microseconds(); - return this->elapsed_microseconds_; -} - -} // namespace caffe diff --git a/src/caffe/util/blocking_queue.cpp b/src/caffe/util/blocking_queue.cpp deleted file mode 100644 index 058668f..0000000 --- a/src/caffe/util/blocking_queue.cpp +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include - -#include "caffe/data_reader.hpp" -#include "caffe/layers/base_data_layer.hpp" -#include "caffe/parallel.hpp" -#include "caffe/util/blocking_queue.hpp" - -namespace caffe { - -template -class BlockingQueue::sync { - public: - mutable boost::mutex mutex_; - boost::condition_variable condition_; -}; - -template -BlockingQueue::BlockingQueue() - : sync_(new sync()) { -} - -template -void BlockingQueue::push(const T& t) { - boost::mutex::scoped_lock lock(sync_->mutex_); - queue_.push(t); - lock.unlock(); - sync_->condition_.notify_one(); -} - -template -bool BlockingQueue::try_pop(T* t) { - boost::mutex::scoped_lock lock(sync_->mutex_); - - if (queue_.empty()) { - return false; - } - - *t = queue_.front(); - queue_.pop(); - return true; -} - -template -T BlockingQueue::pop(const string& log_on_wait) { - boost::mutex::scoped_lock lock(sync_->mutex_); - - while (queue_.empty()) { - if (!log_on_wait.empty()) { - LOG_EVERY_N(INFO, 1000)<< log_on_wait; - } - sync_->condition_.wait(lock); - } - - T t = queue_.front(); - queue_.pop(); - return t; -} - -template -bool BlockingQueue::try_peek(T* t) { - boost::mutex::scoped_lock lock(sync_->mutex_); - - if (queue_.empty()) { - return false; - } - - *t = queue_.front(); - return true; -} - -template -T BlockingQueue::peek() { - boost::mutex::scoped_lock lock(sync_->mutex_); - - while (queue_.empty()) { - sync_->condition_.wait(lock); - } - - return queue_.front(); -} - -template -size_t BlockingQueue::size() const { - boost::mutex::scoped_lock lock(sync_->mutex_); - return queue_.size(); -} - -template class BlockingQueue*>; -template class BlockingQueue*>; -template class BlockingQueue; -template class 
BlockingQueue >; -template class BlockingQueue*>; -template class BlockingQueue*>; - -} // namespace caffe diff --git a/src/caffe/util/cudnn.cpp b/src/caffe/util/cudnn.cpp deleted file mode 100644 index 1772f00..0000000 --- a/src/caffe/util/cudnn.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#ifdef USE_CUDNN -#include "caffe/util/cudnn.hpp" - -namespace caffe { -namespace cudnn { - -float dataType::oneval = 1.0; -float dataType::zeroval = 0.0; -const void* dataType::one = - static_cast(&dataType::oneval); -const void* dataType::zero = - static_cast(&dataType::zeroval); - -double dataType::oneval = 1.0; -double dataType::zeroval = 0.0; -const void* dataType::one = - static_cast(&dataType::oneval); -const void* dataType::zero = - static_cast(&dataType::zeroval); - -} // namespace cudnn -} // namespace caffe -#endif diff --git a/src/caffe/util/db.cpp b/src/caffe/util/db.cpp deleted file mode 100644 index 7f22509..0000000 --- a/src/caffe/util/db.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include "caffe/util/db.hpp" -#include "caffe/util/db_leveldb.hpp" -#include "caffe/util/db_lmdb.hpp" - -#include - -namespace caffe { namespace db { - -DB* GetDB(DataParameter::DB backend) { - switch (backend) { -#ifdef USE_LEVELDB - case DataParameter_DB_LEVELDB: - return new LevelDB(); -#endif // USE_LEVELDB -#ifdef USE_LMDB - case DataParameter_DB_LMDB: - return new LMDB(); -#endif // USE_LMDB - default: - LOG(FATAL) << "Unknown database backend"; - return NULL; - } -} - -DB* GetDB(const string& backend) { -#ifdef USE_LEVELDB - if (backend == "leveldb") { - return new LevelDB(); - } -#endif // USE_LEVELDB -#ifdef USE_LMDB - if (backend == "lmdb") { - return new LMDB(); - } -#endif // USE_LMDB - LOG(FATAL) << "Unknown database backend"; - return NULL; -} - -} // namespace db -} // namespace caffe diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp deleted file mode 100644 index f5c4d8a..0000000 --- a/src/caffe/util/db_leveldb.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#ifdef USE_LEVELDB -#include "caffe/util/db_leveldb.hpp" - -#include - -namespace caffe { namespace db { - -void LevelDB::Open(const string& source, Mode mode) { - leveldb::Options options; - options.block_size = 65536; - options.write_buffer_size = 268435456; - options.max_open_files = 100; - options.error_if_exists = mode == NEW; - options.create_if_missing = mode != READ; - leveldb::Status status = leveldb::DB::Open(options, source, &db_); - CHECK(status.ok()) << "Failed to open leveldb " << source - << std::endl << status.ToString(); - LOG(INFO) << "Opened leveldb " << source; -} - -} // namespace db -} // namespace caffe -#endif // USE_LEVELDB diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp deleted file mode 100644 index fb1d495..0000000 --- a/src/caffe/util/db_lmdb.cpp +++ /dev/null @@ -1,113 +0,0 @@ -#ifdef USE_LMDB -#include "caffe/util/db_lmdb.hpp" - -#include - -#include - -namespace caffe { namespace db { - -void LMDB::Open(const string& source, Mode mode) { - MDB_CHECK(mdb_env_create(&mdb_env_)); - if (mode == NEW) { - CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << " failed"; - } - int flags = 0; - if (mode == READ) { - flags = MDB_RDONLY | MDB_NOTLS; - } - int rc = mdb_env_open(mdb_env_, source.c_str(), flags, 0664); -#ifndef ALLOW_LMDB_NOLOCK - MDB_CHECK(rc); -#else - if (rc == EACCES) { - LOG(WARNING) << "Permission denied. 
Trying with MDB_NOLOCK ..."; - // Close and re-open environment handle - mdb_env_close(mdb_env_); - MDB_CHECK(mdb_env_create(&mdb_env_)); - // Try again with MDB_NOLOCK - flags |= MDB_NOLOCK; - MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664)); - } else { - MDB_CHECK(rc); - } -#endif - LOG(INFO) << "Opened lmdb " << source; -} - -LMDBCursor* LMDB::NewCursor() { - MDB_txn* mdb_txn; - MDB_cursor* mdb_cursor; - MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn)); - MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_)); - MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor)); - return new LMDBCursor(mdb_txn, mdb_cursor); -} - -LMDBTransaction* LMDB::NewTransaction() { - return new LMDBTransaction(mdb_env_); -} - -void LMDBTransaction::Put(const string& key, const string& value) { - keys.push_back(key); - values.push_back(value); -} - -void LMDBTransaction::Commit() { - MDB_dbi mdb_dbi; - MDB_val mdb_key, mdb_data; - MDB_txn *mdb_txn; - - // Initialize MDB variables - MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn)); - MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi)); - - for (int i = 0; i < keys.size(); i++) { - mdb_key.mv_size = keys[i].size(); - mdb_key.mv_data = const_cast(keys[i].data()); - mdb_data.mv_size = values[i].size(); - mdb_data.mv_data = const_cast(values[i].data()); - - // Add data to the transaction - int put_rc = mdb_put(mdb_txn, mdb_dbi, &mdb_key, &mdb_data, 0); - if (put_rc == MDB_MAP_FULL) { - // Out of memory - double the map size and retry - mdb_txn_abort(mdb_txn); - mdb_dbi_close(mdb_env_, mdb_dbi); - DoubleMapSize(); - Commit(); - return; - } - // May have failed for some other reason - MDB_CHECK(put_rc); - } - - // Commit the transaction - int commit_rc = mdb_txn_commit(mdb_txn); - if (commit_rc == MDB_MAP_FULL) { - // Out of memory - double the map size and retry - mdb_dbi_close(mdb_env_, mdb_dbi); - DoubleMapSize(); - Commit(); - return; - } - // May have failed for some other reason - MDB_CHECK(commit_rc); - - // Cleanup after successful commit - mdb_dbi_close(mdb_env_, mdb_dbi); - keys.clear(); - values.clear(); -} - -void LMDBTransaction::DoubleMapSize() { - struct MDB_envinfo current_info; - MDB_CHECK(mdb_env_info(mdb_env_, ¤t_info)); - size_t new_size = current_info.me_mapsize * 2; - DLOG(INFO) << "Doubling LMDB map size to " << (new_size>>20) << "MB ..."; - MDB_CHECK(mdb_env_set_mapsize(mdb_env_, new_size)); -} - -} // namespace db -} // namespace caffe -#endif // USE_LMDB diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu deleted file mode 100644 index a8f30a0..0000000 --- a/src/caffe/util/im2col.cu +++ /dev/null @@ -1,512 +0,0 @@ -#include - -#include "caffe/common.hpp" -#include "caffe/util/im2col.hpp" - -namespace caffe { - -template -__global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int height_col, const int width_col, - Dtype* data_col) { - CUDA_KERNEL_LOOP(index, n) { - const int h_index = index / width_col; - const int h_col = h_index % height_col; - const int w_col = index % width_col; - const int c_im = h_index / height_col; - const int c_col = c_im * kernel_h * kernel_w; - const int h_offset = h_col * stride_h - pad_h; - const int w_offset = w_col * stride_w - pad_w; - Dtype* data_col_ptr = data_col; - data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; - const 
Dtype* data_im_ptr = data_im; - data_im_ptr += (c_im * height + h_offset) * width + w_offset; - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - int h_im = h_offset + i * dilation_h; - int w_im = w_offset + j * dilation_w; - *data_col_ptr = - (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? - data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; - data_col_ptr += height_col * width_col; - } - } - } -} - -template -void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - Dtype* data_col) { - // We are going to launch channels * height_col * width_col kernels, each - // kernel responsible for copying a single-channel grid. - int height_col = (height + 2 * pad_h - - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - int width_col = (width + 2 * pad_w - - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - int num_kernels = channels * height_col * width_col; - // NOLINT_NEXT_LINE(whitespace/operators) - im2col_gpu_kernel<<>>( - num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, - pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col, - width_col, data_col); - CUDA_POST_KERNEL_CHECK; -} - -// Explicit instantiation -template void im2col_gpu(const float* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, float* data_col); -template void im2col_gpu(const double* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, double* data_col); - -template -__global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - const int* dilation, Dtype* data_col) { - int d_temp[num_axes]; // NOLINT(runtime/arrays) - int d_iter[num_axes]; // NOLINT(runtime/arrays) - - __shared__ int shared_dilation[num_axes]; - __shared__ int shared_kernel_shape[num_axes]; - __shared__ int shared_pad[num_axes]; - __shared__ int shared_stride[num_axes]; - __shared__ int shared_col_shape[num_axes + 1]; - __shared__ int shared_im_shape[num_axes + 1]; - - if (threadIdx.x < num_axes) { - shared_dilation[threadIdx.x] = dilation[threadIdx.x]; - shared_kernel_shape[threadIdx.x] = kernel_shape[threadIdx.x]; - shared_pad[threadIdx.x] = pad[threadIdx.x]; - shared_stride[threadIdx.x] = stride[threadIdx.x]; - } - if (threadIdx.x < num_axes + 1) { - shared_col_shape[threadIdx.x] = col_shape[threadIdx.x]; - shared_im_shape[threadIdx.x] = im_shape[threadIdx.x]; - } - __syncthreads(); - - int i; - CUDA_KERNEL_LOOP(index, n) { - // Initialize channel_in, computed in the loop below, with intermediate - // computations used to compute the spatial indices. 
- int channel_in = index; - int channel_out = 1; - for (i = num_axes - 1; i >= 0; --i) { - d_temp[i] = channel_in % shared_col_shape[i + 1]; - channel_in /= shared_col_shape[i + 1]; - channel_out *= shared_kernel_shape[i]; - } - channel_out *= channel_in; - int data_col_inc = 1; - for (i = 0; i < num_axes; ++i) { - channel_out *= shared_col_shape[i + 1]; - channel_out += d_temp[i]; - d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i]; - channel_in *= shared_im_shape[i + 1]; - channel_in += d_temp[i]; - data_col_inc *= shared_col_shape[i + 1]; - d_iter[i] = 0; - } - Dtype* data_col_ptr = data_col + channel_out; - const Dtype* data_im_ptr = data_im + channel_in; - bool incremented; - do { - bool in_range = true; - for (i = 0; i < num_axes; ++i) { - const int d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i]; - in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1]; - if (!in_range) { break; } - } - if (in_range) { - int data_im_offset = d_iter[0] * shared_dilation[0]; - for (i = 1; i < num_axes; ++i) { - data_im_offset *= shared_im_shape[i + 1]; - data_im_offset += d_iter[i] * shared_dilation[i]; - } - *data_col_ptr = data_im_ptr[data_im_offset]; - } else { - *data_col_ptr = 0; - } - data_col_ptr += data_col_inc; - incremented = false; - for (i = num_axes - 1; i >= 0; --i) { - const int d_max = shared_kernel_shape[i]; - if (d_iter[i] == d_max - 1) { - d_iter[i] = 0; - } else { // d_iter[i] < d_max - 1 - ++d_iter[i]; - incremented = true; - break; - } - } // for (int i = num_axes - 1; i >= 0; --i) - } while (incremented); // do - } // CUDA_KERNEL_LOOP(index, n) -} - -template -void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes, - const int num_kernels, const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - const int* dilation, Dtype* data_col) { - // num_axes should be smaller than block size - DCHECK_LT(num_spatial_axes, CAFFE_CUDA_NUM_THREADS); - switch (num_spatial_axes) { - case 1: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - case 2: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - case 3: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - case 4: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - case 5: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - case 6: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - case 7: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - case 8: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - case 9: - im2col_nd_gpu_kernel // 
NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - case 10: - im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - num_kernels, data_im, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_col); - break; - default: - LOG(FATAL) << "im2col_nd_gpu does not support computation with " - << num_spatial_axes << " spatial axes"; - } - CUDA_POST_KERNEL_CHECK; -} - -// Explicit instantiation -template void im2col_nd_gpu(const float* data_im, - const int num_spatial_axes, const int col_size, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - const int* dilation, float* data_col); -template void im2col_nd_gpu(const double* data_im, - const int num_spatial_axes, const int col_size, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - const int* dilation, double* data_col); - -template -__global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, - const int height, const int width, const int channels, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int height_col, const int width_col, - Dtype* data_im) { - CUDA_KERNEL_LOOP(index, n) { - Dtype val = 0; - const int w_im = index % width + pad_w; - const int h_im = (index / width) % height + pad_h; - const int c_im = index / (width * height); - int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; - int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; - // compute the start and end of the output - const int w_col_start = - (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; - const int w_col_end = min(w_im / stride_w + 1, width_col); - const int h_col_start = - (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; - const int h_col_end = min(h_im / stride_h + 1, height_col); - // TODO: use LCM of stride and dilation to avoid unnecessary loops - for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { - for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { - int h_k = (h_im - h_col * stride_h); - int w_k = (w_im - w_col * stride_w); - if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { - h_k /= dilation_h; - w_k /= dilation_w; - int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * - height_col + h_col) * width_col + w_col; - val += data_col[data_col_index]; - } - } - } - data_im[index] = val; - } -} - -template -void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, const int dilation_h, const int dilation_w, - Dtype* data_im) { - int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / - stride_h + 1; - int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / - stride_w + 1; - int num_kernels = channels * height * width; - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. 
- // NOLINT_NEXT_LINE(whitespace/operators) - col2im_gpu_kernel<<>>( - num_kernels, data_col, height, width, channels, kernel_h, kernel_w, - pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, - height_col, width_col, data_im); - CUDA_POST_KERNEL_CHECK; -} - -// Explicit instantiation -template void col2im_gpu(const float* data_col, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, const int dilation_h, const int dilation_w, - float* data_im); -template void col2im_gpu(const double* data_col, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, const int dilation_h, const int dilation_w, - double* data_im); - -template -__global__ void col2im_nd_gpu_kernel(const int n, const Dtype* data_col, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - const int* dilation, Dtype* data_im) { - int d_im[num_axes]; // NOLINT(runtime/arrays) - int d_col_iter[num_axes]; // NOLINT(runtime/arrays) - int d_col_start[num_axes]; // NOLINT(runtime/arrays) - int d_col_end[num_axes]; // NOLINT(runtime/arrays) - - __shared__ int shared_dilation[num_axes]; - __shared__ int shared_kernel_shape[num_axes]; - __shared__ int shared_pad[num_axes]; - __shared__ int shared_stride[num_axes]; - __shared__ int shared_col_shape[num_axes + 1]; - __shared__ int shared_im_shape[num_axes + 1]; - - if (threadIdx.x < num_axes) { - shared_dilation[threadIdx.x] = dilation[threadIdx.x]; - shared_kernel_shape[threadIdx.x] = kernel_shape[threadIdx.x]; - shared_pad[threadIdx.x] = pad[threadIdx.x]; - shared_stride[threadIdx.x] = stride[threadIdx.x]; - } - if (threadIdx.x < num_axes + 1) { - shared_col_shape[threadIdx.x] = col_shape[threadIdx.x]; - shared_im_shape[threadIdx.x] = im_shape[threadIdx.x]; - } - __syncthreads(); - - CUDA_KERNEL_LOOP(index, n) { - // Initialize channel_in, computed in the loop below, with intermediate - // computations used to compute the spatial indices. - int c_im = index; - // Calculate d_im (image dimensions). - for (int i = num_axes - 1; i >= 0; --i) { - d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i]; - c_im /= shared_im_shape[i + 1]; - } - // Calculate col start/end indices. - bool done = false; - for (int i = 0; i < num_axes; ++i) { - const int kernel_extent = - shared_dilation[i] * (shared_kernel_shape[i] - 1) + 1; - d_col_start[i] = d_col_iter[i] = - (d_im[i] < kernel_extent) ? 0 : - (d_im[i] - kernel_extent) / shared_stride[i] + 1; - d_col_end[i] = - min(d_im[i] / shared_stride[i] + 1, shared_col_shape[i + 1]); - if (d_col_start[i] >= d_col_end[i]) { - // Skip computation if the dimension is 0 at any spatial axis -- - // final val will be 0. - data_im[index] = 0; - done = true; - break; // for (int i = 0; i < num_axes; ++i) - } - } - if (done) { - continue; // CUDA_KERNEL_LOOP(index, n) - } - // Loop over the col to compute the output val. - Dtype val = 0; - bool incremented = true; - bool skip = false; - do { - // Compute the final offset. 
- int final_offset = 0; - int kernel_shape_prod = 1; - int kernel_index; - for (int i = num_axes - 1; i >= 0; --i) { - kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i]; - if (kernel_index % shared_dilation[i]) { - skip = true; - break; - } else { - kernel_index /= shared_dilation[i]; - final_offset += kernel_index * kernel_shape_prod; - kernel_shape_prod *= shared_kernel_shape[i]; - } - } - if (!skip) { - final_offset += kernel_shape_prod * c_im; - for (int i = 0; i < num_axes; ++i) { - final_offset *= shared_col_shape[i + 1]; - final_offset += d_col_iter[i]; - } - val += data_col[final_offset]; - } - skip = false; - incremented = false; - for (int i = num_axes - 1; i >= 0; --i) { - const int d_max = d_col_end[i]; - if (d_col_iter[i] == d_max - 1) { - d_col_iter[i] = d_col_start[i]; - } else { // d_col_iter[i] < d_max - 1 - ++d_col_iter[i]; - incremented = true; - break; // for (int i = num_axes - 1; i >= 0; --i) - } - } // for (int i = num_axes - 1; i >= 0; --i) - } while (incremented); - data_im[index] = val; - } // CUDA_KERNEL_LOOP(index, n) -} - -template -void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes, - const int im_size, const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - const int* dilation, Dtype* data_im) { - // num_axes should be smaller than block size - DCHECK_LT(num_spatial_axes, CAFFE_CUDA_NUM_THREADS); - switch (num_spatial_axes) { - case 1: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - case 2: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - case 3: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - case 4: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - case 5: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - case 6: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - case 7: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - case 8: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - case 9: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - case 10: - col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - im_size, data_col, im_shape, col_shape, - kernel_shape, pad, stride, dilation, data_im); - break; - default: - LOG(FATAL) << "col2im_nd_gpu does not support computation with " - << num_spatial_axes << " spatial axes"; - } - CUDA_POST_KERNEL_CHECK; -} - -// Explicit instantiation -template void col2im_nd_gpu(const float* data_col, - const int num_spatial_axes, 
const int im_size, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - const int* dilation, float* data_im); -template void col2im_nd_gpu(const double* data_col, - const int num_spatial_axes, const int im_size, - const int* im_shape, const int* col_shape, - const int* kernel_shape, const int* pad, const int* stride, - const int* dilation, double* data_im); - -} // namespace caffe diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index b2f9b48..aeabe92 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -2,12 +2,6 @@ #include #include #include -#ifdef USE_OPENCV -#include -#include -#include -#include -#endif // USE_OPENCV #include #include @@ -19,8 +13,10 @@ #include "caffe/proto/caffe.pb.h" #include "caffe/util/io.hpp" +#ifdef WIN32 #include #define open _open +#endif const int kProtoReadBytesLimit = INT_MAX; // Max size of 2 GB minus 1 byte. @@ -72,170 +68,4 @@ void WriteProtoToBinaryFile(const Message& proto, const char* filename) { CHECK(proto.SerializeToOstream(&output)); } -#ifdef USE_OPENCV -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color) { - cv::Mat cv_img; - int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); - cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag); - if (!cv_img_origin.data) { - LOG(ERROR) << "Could not open or find file " << filename; - return cv_img_origin; - } - if (height > 0 && width > 0) { - cv::resize(cv_img_origin, cv_img, cv::Size(width, height)); - } else { - cv_img = cv_img_origin; - } - return cv_img; -} - -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width) { - return ReadImageToCVMat(filename, height, width, true); -} - -cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color) { - return ReadImageToCVMat(filename, 0, 0, is_color); -} - -cv::Mat ReadImageToCVMat(const string& filename) { - return ReadImageToCVMat(filename, 0, 0, true); -} - -// Do the file extension and encoding match? -static bool matchExt(const std::string & fn, - std::string en) { - size_t p = fn.rfind('.'); - std::string ext = p != fn.npos ? 
fn.substr(p) : fn; - std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); - std::transform(en.begin(), en.end(), en.begin(), ::tolower); - if ( ext == en ) - return true; - if ( en == "jpg" && ext == "jpeg" ) - return true; - return false; -} - -bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum) { - cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color); - if (cv_img.data) { - if (encoding.size()) { - if ( (cv_img.channels() == 3) == is_color && !height && !width && - matchExt(filename, encoding) ) - return ReadFileToDatum(filename, label, datum); - std::vector buf; - cv::imencode("."+encoding, cv_img, buf); - datum->set_data(std::string(reinterpret_cast(&buf[0]), - buf.size())); - datum->set_label(label); - datum->set_encoded(true); - return true; - } - CVMatToDatum(cv_img, datum); - datum->set_label(label); - return true; - } else { - return false; - } -} -#endif // USE_OPENCV - -bool ReadFileToDatum(const string& filename, const int label, - Datum* datum) { - std::streampos size; - - fstream file(filename.c_str(), ios::in|ios::binary|ios::ate); - if (file.is_open()) { - size = file.tellg(); - std::string buffer(size, ' '); - file.seekg(0, ios::beg); - file.read(&buffer[0], size); - file.close(); - datum->set_data(buffer); - datum->set_label(label); - datum->set_encoded(true); - return true; - } else { - return false; - } -} - -#ifdef USE_OPENCV -cv::Mat DecodeDatumToCVMatNative(const Datum& datum) { - cv::Mat cv_img; - CHECK(datum.encoded()) << "Datum not encoded"; - const string& data = datum.data(); - std::vector vec_data(data.c_str(), data.c_str() + data.size()); - cv_img = cv::imdecode(vec_data, -1); - if (!cv_img.data) { - LOG(ERROR) << "Could not decode datum "; - } - return cv_img; -} -cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color) { - cv::Mat cv_img; - CHECK(datum.encoded()) << "Datum not encoded"; - const string& data = datum.data(); - std::vector vec_data(data.c_str(), data.c_str() + data.size()); - int cv_read_flag = (is_color ? 
CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); - cv_img = cv::imdecode(vec_data, cv_read_flag); - if (!cv_img.data) { - LOG(ERROR) << "Could not decode datum "; - } - return cv_img; -} - -// If Datum is encoded will decoded using DecodeDatumToCVMat and CVMatToDatum -// If Datum is not encoded will do nothing -bool DecodeDatumNative(Datum* datum) { - if (datum->encoded()) { - cv::Mat cv_img = DecodeDatumToCVMatNative((*datum)); - CVMatToDatum(cv_img, datum); - return true; - } else { - return false; - } -} -bool DecodeDatum(Datum* datum, bool is_color) { - if (datum->encoded()) { - cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color); - CVMatToDatum(cv_img, datum); - return true; - } else { - return false; - } -} - -void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { - CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; - datum->set_channels(cv_img.channels()); - datum->set_height(cv_img.rows); - datum->set_width(cv_img.cols); - datum->clear_data(); - datum->clear_float_data(); - datum->set_encoded(false); - int datum_channels = datum->channels(); - int datum_height = datum->height(); - int datum_width = datum->width(); - int datum_size = datum_channels * datum_height * datum_width; - std::string buffer(datum_size, ' '); - for (int h = 0; h < datum_height; ++h) { - const uchar* ptr = cv_img.ptr(h); - int img_index = 0; - for (int w = 0; w < datum_width; ++w) { - for (int c = 0; c < datum_channels; ++c) { - int datum_index = (c * datum_height + h) * datum_width + w; - buffer[datum_index] = static_cast(ptr[img_index++]); - } - } - } - datum->set_data(buffer); -} -#endif // USE_OPENCV } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 71c0227..e6ca4bf 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -1,7 +1,5 @@ -#include -#include - #include +#include #include "caffe/common.hpp" #include "caffe/util/math_functions.hpp" @@ -85,16 +83,7 @@ void caffe_add_scalar(const int N, const double alpha, double* Y) { template void caffe_copy(const int N, const Dtype* X, Dtype* Y) { if (X != Y) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - // NOLINT_NEXT_LINE(caffe/alt_fn) - CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); -#else - NO_GPU; -#endif - } else { - memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - } + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) } } @@ -232,8 +221,7 @@ unsigned int caffe_rng_rand() { template Dtype caffe_nextafter(const Dtype b) { - return boost::math::nextafter( - b, std::numeric_limits::max()); + return std::nextafter(b, std::numeric_limits::max()); } template @@ -247,11 +235,10 @@ void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { CHECK_GE(n, 0); CHECK(r); CHECK_LE(a, b); - boost::uniform_real random_distribution(a, caffe_nextafter(b)); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + std::uniform_real random_distribution(a, caffe_nextafter(b)); + rng_t* engine = caffe_rng(); for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); + r[i] = random_distribution(*engine); } } @@ -269,11 +256,10 @@ void caffe_rng_gaussian(const int n, const Dtype a, CHECK_GE(n, 0); CHECK(r); CHECK_GT(sigma, 0); - boost::normal_distribution random_distribution(a, sigma); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + std::normal_distribution random_distribution(a, sigma); + rng_t* engine = caffe_rng(); for 
(int i = 0; i < n; ++i) { - r[i] = variate_generator(); + r[i] = random_distribution(*engine); } } @@ -291,11 +277,10 @@ void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { CHECK(r); CHECK_GE(p, 0); CHECK_LE(p, 1); - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + std::bernoulli_distribution random_distribution(p); + rng_t* engine = caffe_rng(); for (int i = 0; i < n; ++i) { - r[i] = variate_generator(); + r[i] = random_distribution(*engine); } } @@ -311,11 +296,10 @@ void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { CHECK(r); CHECK_GE(p, 0); CHECK_LE(p, 1); - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + std::bernoulli_distribution random_distribution(p); + rng_t* engine = caffe_rng(); for (int i = 0; i < n; ++i) { - r[i] = static_cast(variate_generator()); + r[i] = static_cast(random_distribution(*engine)); } } diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu deleted file mode 100644 index 4c58753..0000000 --- a/src/caffe/util/math_functions.cu +++ /dev/null @@ -1,418 +0,0 @@ -#include // CUDA's, not caffe's, for fabs, signbit -#include -#include // thrust::plus -#include - -#include - -#include "caffe/common.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template <> -void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); -} - -template <> -void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { - // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); -} - -template <> -void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const float alpha, const float* A, const float* x, - const float beta, float* y) { - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); -} - -template <> -void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, - const int N, const double alpha, const double* A, const double* x, - const double beta, double* y) { - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); -} - -template <> -void caffe_gpu_axpy(const int N, const float alpha, const float* X, - float* Y) { - CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); -} - -template <> -void caffe_gpu_axpy(const int N, const double alpha, const double* X, - double* Y) { - CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); -} - -void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) { - if (X != Y) { - CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault)); // NOLINT(caffe/alt_fn) - } -} - -template <> -void caffe_gpu_scal(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); -} - -template <> -void caffe_gpu_scal(const int N, const double alpha, double *X) { - CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); -} - -template <> -void caffe_gpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); -} - -template <> -void caffe_gpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { - caffe_gpu_scal(N, beta, Y); - caffe_gpu_axpy(N, alpha, X, Y); -} - -template <> -void caffe_gpu_dot(const int n, const float* x, const float* y, - float* out) { - CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); -} - -template <> -void caffe_gpu_dot(const int n, const double* x, const double* y, - double * out) { - CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); -} - -template <> -void caffe_gpu_asum(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); -} - -template <> -void caffe_gpu_asum(const int n, const double* x, double* y) { - CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y)); -} - -template <> -void caffe_gpu_scale(const int n, const float alpha, const float *x, - float* y) { - CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); -} - -template <> -void caffe_gpu_scale(const int n, const double alpha, const double *x, - double* y) { - CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); -} - -template -__global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = alpha; - } -} - -template -void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) { - if (alpha == 0) { - CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N)); // NOLINT(caffe/alt_fn) - return; - } - // NOLINT_NEXT_LINE(whitespace/operators) - set_kernel<<>>( - N, alpha, Y); -} - -template void caffe_gpu_set(const int N, const int alpha, int* Y); -template void caffe_gpu_set(const int N, const float alpha, float* Y); -template void caffe_gpu_set(const int N, const double alpha, double* Y); - -template -__global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] += alpha; - } -} - -template <> -void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { - // NOLINT_NEXT_LINE(whitespace/operators) - add_scalar_kernel<<>>( - N, alpha, Y); -} - -template <> -void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { - // NOLINT_NEXT_LINE(whitespace/operators) - 
add_scalar_kernel<<>>( - N, alpha, Y); -} - -template -__global__ void add_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] + b[index]; - } -} - -template <> -void caffe_gpu_add(const int N, const float* a, const float* b, - float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - add_kernel<<>>( - N, a, b, y); -} - -template <> -void caffe_gpu_add(const int N, const double* a, const double* b, - double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - add_kernel<<>>( - N, a, b, y); -} - -template -__global__ void sub_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] - b[index]; - } -} - -template <> -void caffe_gpu_sub(const int N, const float* a, const float* b, - float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - sub_kernel<<>>( - N, a, b, y); -} - -template <> -void caffe_gpu_sub(const int N, const double* a, const double* b, - double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - sub_kernel<<>>( - N, a, b, y); -} - -template -__global__ void mul_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] * b[index]; - } -} - -template <> -void caffe_gpu_mul(const int N, const float* a, - const float* b, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - mul_kernel<<>>( - N, a, b, y); -} - -template <> -void caffe_gpu_mul(const int N, const double* a, - const double* b, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - mul_kernel<<>>( - N, a, b, y); -} - -template -__global__ void div_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] / b[index]; - } -} - -template <> -void caffe_gpu_div(const int N, const float* a, - const float* b, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - div_kernel<<>>( - N, a, b, y); -} - -template <> -void caffe_gpu_div(const int N, const double* a, - const double* b, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - div_kernel<<>>( - N, a, b, y); -} - -template -__global__ void abs_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = abs(a[index]); - } -} - -template <> -void caffe_gpu_abs(const int N, const float* a, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - abs_kernel<<>>( - N, a, y); -} - -template <> -void caffe_gpu_abs(const int N, const double* a, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - abs_kernel<<>>( - N, a, y); -} - - -template -__global__ void exp_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = exp(a[index]); - } -} - -template <> -void caffe_gpu_exp(const int N, const float* a, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - exp_kernel<<>>( - N, a, y); -} - -template <> -void caffe_gpu_exp(const int N, const double* a, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - exp_kernel<<>>( - N, a, y); -} - -template -__global__ void log_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = log(a[index]); - } -} - -template <> -void caffe_gpu_log(const int N, const float* a, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - log_kernel<<>>( - N, a, y); -} - -template <> -void caffe_gpu_log(const int N, const double* a, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - log_kernel<<>>( - N, a, y); -} - -template -__global__ void powx_kernel(const int n, 
const Dtype* a, - const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = pow(a[index], alpha); - } -} - -template <> -void caffe_gpu_powx(const int N, const float* a, - const float alpha, float* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - powx_kernel<<>>( - N, a, alpha, y); -} - -template <> -void caffe_gpu_powx(const int N, const double* a, - const double alpha, double* y) { - // NOLINT_NEXT_LINE(whitespace/operators) - powx_kernel<<>>( - N, a, alpha, y); -} - -DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - - (x[index] < Dtype(0))); -DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); - -void caffe_gpu_rng_uniform(const int n, unsigned int* r) { - CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); -} - -template <> -void caffe_gpu_rng_uniform(const int n, const float a, const float b, - float* r) { - CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); - const float range = b - a; - if (range != static_cast(1)) { - caffe_gpu_scal(n, range, r); - } - if (a != static_cast(0)) { - caffe_gpu_add_scalar(n, a, r); - } -} - -template <> -void caffe_gpu_rng_uniform(const int n, const double a, const double b, - double* r) { - CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); - const double range = b - a; - if (range != static_cast(1)) { - caffe_gpu_scal(n, range, r); - } - if (a != static_cast(0)) { - caffe_gpu_add_scalar(n, a, r); - } -} - -template <> -void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, - float* r) { - CURAND_CHECK( - curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); -} - -template <> -void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, - double* r) { - CURAND_CHECK( - curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); -} - -} // namespace caffe diff --git a/src/caffe/util/signal_handler.cpp b/src/caffe/util/signal_handler.cpp deleted file mode 100644 index c3539f6..0000000 --- a/src/caffe/util/signal_handler.cpp +++ /dev/null @@ -1,115 +0,0 @@ -#include -#include - -#include -#include - -#include "caffe/util/signal_handler.h" - -//namespace { -// static volatile sig_atomic_t got_sigint = false; -// static volatile sig_atomic_t got_sighup = false; -// static bool already_hooked_up = false; -// -// void handle_signal(int signal) { -// switch (signal) { -// case SIGHUP: -// got_sighup = true; -// break; -// case SIGINT: -// got_sigint = true; -// break; -// } -// } -// -// void HookupHandler() { -// if (already_hooked_up) { -// LOG(FATAL) << "Tried to hookup signal handlers more than once."; -// } -// already_hooked_up = true; -// -// struct sigaction sa; -// // Setup the handler -// sa.sa_handler = &handle_signal; -// // Restart the system call, if at all possible -// sa.sa_flags = SA_RESTART; -// // Block every signal during the handler -// sigfillset(&sa.sa_mask); -// // Intercept SIGHUP and SIGINT -// if (sigaction(SIGHUP, &sa, NULL) == -1) { -// LOG(FATAL) << "Cannot install SIGHUP handler."; -// } -// if (sigaction(SIGINT, &sa, NULL) == -1) { -// LOG(FATAL) << "Cannot install SIGINT handler."; -// } -// } -// -// // Set the signal handlers to the default. 
-// void UnhookHandler() { -// if (already_hooked_up) { -// struct sigaction sa; -// // Setup the sighub handler -// sa.sa_handler = SIG_DFL; -// // Restart the system call, if at all possible -// sa.sa_flags = SA_RESTART; -// // Block every signal during the handler -// sigfillset(&sa.sa_mask); -// // Intercept SIGHUP and SIGINT -// if (sigaction(SIGHUP, &sa, NULL) == -1) { -// LOG(FATAL) << "Cannot uninstall SIGHUP handler."; -// } -// if (sigaction(SIGINT, &sa, NULL) == -1) { -// LOG(FATAL) << "Cannot uninstall SIGINT handler."; -// } -// -// already_hooked_up = false; -// } -// } -// -// // Return true iff a SIGINT has been received since the last time this -// // function was called. -// bool GotSIGINT() { -// bool result = got_sigint; -// got_sigint = false; -// return result; -// } -// -// // Return true iff a SIGHUP has been received since the last time this -// // function was called. -// bool GotSIGHUP() { -// bool result = got_sighup; -// got_sighup = false; -// return result; -// } -//} // namespace -// -//namespace caffe { -// -//SignalHandler::SignalHandler(SolverAction::Enum SIGINT_action, -// SolverAction::Enum SIGHUP_action): -// SIGINT_action_(SIGINT_action), -// SIGHUP_action_(SIGHUP_action) { -// HookupHandler(); -//} -// -//SignalHandler::~SignalHandler() { -// UnhookHandler(); -//} -// -//SolverAction::Enum SignalHandler::CheckForSignals() const { -// if (GotSIGHUP()) { -// return SIGHUP_action_; -// } -// if (GotSIGINT()) { -// return SIGINT_action_; -// } -// return SolverAction::NONE; -//} -// -//// Return the function that the solver can use to find out if a snapshot or -//// early exit is being requested. -//ActionCallback SignalHandler::GetActionFunction() { -// return boost::bind(&SignalHandler::CheckForSignals, this); -//} -// -//} // namespace caffe
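
For reference, here is a minimal, self-contained sketch of the std::<random> pattern that the patched caffe_rng_uniform / caffe_rng_gaussian helpers in src/caffe/util/math_functions.cpp adopt in place of the boost variate generators. It is an illustration only, not the patch itself: the static engine below is a simplified stand-in for the generator the real code obtains through caffe_rng(), and assert() stands in for the glog CHECK macros.

#include <cassert>
#include <cmath>
#include <limits>
#include <random>

// Simplified stand-in for Caffe's per-thread generator (caffe_rng() / rng_t).
static std::mt19937 engine{std::random_device{}()};

template <typename Dtype>
Dtype caffe_nextafter(const Dtype b) {
  // Nudge b upward so the uniform draw is effectively inclusive of b.
  return std::nextafter(b, std::numeric_limits<Dtype>::max());
}

template <typename Dtype>
void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
  assert(n >= 0 && r != nullptr && a <= b);  // mirrors the CHECK_* guards
  std::uniform_real_distribution<Dtype> dist(a, caffe_nextafter(b));
  for (int i = 0; i < n; ++i) {
    r[i] = dist(engine);
  }
}

template <typename Dtype>
void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, Dtype* r) {
  assert(n >= 0 && r != nullptr && sigma > 0);
  std::normal_distribution<Dtype> dist(mu, sigma);
  for (int i = 0; i < n; ++i) {
    r[i] = dist(engine);
  }
}

int main() {
  float buf[4];
  caffe_rng_uniform(4, 0.f, 1.f, buf);
  caffe_rng_gaussian(4, 0.f, 1.f, buf);
  return 0;
}

In the real helpers the engine comes from Caffe's seeded RNG state rather than std::random_device, which is what keeps runs reproducible when a fixed seed is configured; the sketch seeds from std::random_device purely for brevity.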